# Overall

## Setup

In [1]:
import os
import re
import glob
import json
import uuid
import yaml
import tiktoken
import requests
import pandas as pd
from io import StringIO
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import markdown
import chromadb
from chromadb.config import Settings

In [2]:
# Vars

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
together_api_key = os.getenv('TOGETHER_API_KEY')

## Data

In [3]:
def fetch_html(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve HTML. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [4]:
# Pull all URLs from file
file_df = pd.read_csv(r"..\Data\Other\html_data.csv")
print(f"CSV Data\n{file_df.head(3)}\n")

all_urls = file_df['URL'].tolist()
print(f"All URLs\n{all_urls}")

CSV Data
   Index                                                URL  Num of Tables  \
0      1  https://support.microsoft.com/en-us/office/int...              1   
1      2  https://support.microsoft.com/en-us/office/for...              1   
2      3  https://support.microsoft.com/en-us/office/vid...              1   

   Pictures of Tables Dynamic Dropdown Note  
0                   0               No    -  
1                   0               No    -  
2                   0               No    -  

All URLs
['https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'https://support.microsoft.com/en-us/office/format-a-table-e6e77bc6-1f4e-467e-b818-2e2acc488006', 'https://support.microsoft.com/en-us/office/video-get-started-with-table-relationships-728d53ff-f332-4ac6-9382-574ee271500a', 'https://support.microsoft.com/en-us/office/resize-a-table-column-or-row-9340d478-21be-4392-81cf-488f7bbd6715', 'https://support.microsoft.com/en-us/offi

In [5]:
# Save HTML content from each URL to a file
html_output_path = r"..\Data\HTML"
for index, single_url in enumerate(all_urls, start=1):
    html_content = fetch_html(single_url)
    if html_content:
        file_name = f"page_{index}.html"
        file_path = os.path.join(html_output_path, file_name)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(html_content)
        print(f"HTML content saved to {file_path}")

HTML content saved to ..\Data\HTML\page_1.html
HTML content saved to ..\Data\HTML\page_2.html
HTML content saved to ..\Data\HTML\page_3.html
HTML content saved to ..\Data\HTML\page_4.html
HTML content saved to ..\Data\HTML\page_5.html
HTML content saved to ..\Data\HTML\page_6.html
HTML content saved to ..\Data\HTML\page_7.html
HTML content saved to ..\Data\HTML\page_8.html
HTML content saved to ..\Data\HTML\page_9.html
HTML content saved to ..\Data\HTML\page_10.html
HTML content saved to ..\Data\HTML\page_11.html
HTML content saved to ..\Data\HTML\page_12.html
HTML content saved to ..\Data\HTML\page_13.html
HTML content saved to ..\Data\HTML\page_14.html
HTML content saved to ..\Data\HTML\page_15.html
HTML content saved to ..\Data\HTML\page_16.html
HTML content saved to ..\Data\HTML\page_17.html
HTML content saved to ..\Data\HTML\page_18.html
Failed to retrieve HTML. Status code: 403


## Functions

In [6]:
# LLM API Functions

def make_openai_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}',
    }
    response = requests.post('https://api.openai.com/v1/chat/completions', json=payload, headers=headers)
    
    result = response.json()

    input_tokens = result['usage']['prompt_tokens']
    output_tokens = result['usage']['completion_tokens']
    result_text = result['choices'][0]['message']['content']

    return input_tokens, output_tokens, result_text
    
def make_together_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {together_api_key}',
    }
    # print(headers)
    response = requests.post('https://api.together.xyz/v1/chat/completions', json=payload, headers=headers)
    # print(response)
    
    result = response.json()
    # print(result)

    result_text = result['choices'][0]['message']['content']
    # print(result_text)

    return result_text

In [7]:
# Table Related

def process_html_for_tables(raw_html):
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Find all table elements
    tables = soup.find_all('table')
    print(f"Num of Table Elements: {len(tables)}")
    
    top_level_tables = [table for table in tables if not table.find_parent('table')]
    print(f"Num of Parent Table Elements: {len(top_level_tables)}")

    # Filter out tables that are just wrappers for images or have insufficient content
    tables_with_content = []
    for table in top_level_tables:
        # Count all the elements within each table
        all_elements = table.find_all(True)
        # Count all the img tags within each table
        img_elements = table.find_all('img')
        
        # Count non-empty cells and img wrappers
        non_empty_cells = [cell for cell in table.find_all('td') if cell.get_text(strip=True)]
        img_wrappers = [cell for cell in table.find_all('td') if cell.find('img')]
        
        # Adjust condition to filter out tables that are primarily image wrappers or have insufficient content
        if len(non_empty_cells) > 0 and len(all_elements) - len(img_elements) > len(img_wrappers):
            tables_with_content.append(table)
    print(f"Num of Filtered Parent Table Elements: {len(tables_with_content)}\n")

    return tables_with_content

def extract_table_data(table):
    headers = [th.get_text(strip=True) for th in table.find_all('th')]
    rows = [[td.get_text(strip=True) for td in tr.find_all('td')] for tr in table.find_all('tr')]
    
    # Clean rows from empty values or rows
    cleaned_rows = [row for row in rows if any(row)]
    
    # Adjusting for tables without header rows, using column indexes as headers
    if not headers and cleaned_rows:
        headers = [f"Column {i+1}" for i in range(len(cleaned_rows[0]))]
    
    return headers, cleaned_rows
def convert_to_json(headers, rows):
    data_list = []
    for row in rows:
        row_data = {headers[i]: cell for i, cell in enumerate(row)}
        data_list.append(row_data)
    return json.dumps(data_list, indent=2)
def convert_to_markdown(headers, rows):
    md_string = "| " + " | ".join(headers) + " |\n"
    md_string += "| " + " | ".join(["---"] * len(headers)) + " |\n"
    for row in rows:
        md_string += "| " + " | ".join(row) + " |\n"
    return md_string
def convert_to_yaml(headers, rows):
    data_list = []
    for row in rows:
        row_data = {headers[i]: cell for i, cell in enumerate(row)}
        data_list.append(row_data)
    return yaml.dump(data_list)
def convert_to_txt(headers, rows):
    max_lengths = [max(len(headers[i]), max(len(row[i]) for row in rows)) for i in range(len(headers))]
    txt_string = " | ".join(headers[i].ljust(max_lengths[i]) for i in range(len(headers))) + "\n"
    txt_string += "-+-".join("-" * max_lengths[i] for i in range(len(headers))) + "\n"
    for row in rows:
        txt_string += " | ".join(row[i].ljust(max_lengths[i]) for i in range(len(row))) + "\n"
    return txt_string

def table_to_format(table_text: str, source_format: str, output_format: str) -> pd.DataFrame:
    """Extract tables from HTML content string based on its extension."""

    # print(f"source_format: {source_format} | output_format: {output_format}\ntable_text\n{table_text}\n")

    headers, rows = extract_table_data(table_text)  # Using the first suitable table

    if output_format == 'json':
        return convert_to_json(headers, rows)
    elif output_format == 'md' or output_format == 'markdown':
        return convert_to_markdown(headers, rows)
    elif output_format == 'yaml':
        return convert_to_yaml(headers, rows)
    elif output_format == 'txt':
        return convert_to_txt(headers, rows)
    else:
        return "Unsupported format requested."
    

In [8]:
# Token Related

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [9]:
# Chunking Related

def chunk_text_default(text, chunk_size=1024):
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += chunk_size
    return chunks

def chunk_text_with_overlap(text, chunk_size=1024, overlap_percentage=10):
    # Example chunking logic with overlap, adjust as necessary
    chunks = []
    overlap_size = int(chunk_size * (overlap_percentage / 100))
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        start += (chunk_size - overlap_size)
    return chunks

## Approaches

### Single Table - NonRAG - Raw HTML

In [42]:
file_name = "page_1.html"
table_num = 0
user_query = f"""
what data type is orange
"""

file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{table_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 649 tokens

input tokens: 692
output tokens: 10

response: Orange would be considered as Short Text data type.


### Single Table - NonRAG - Formatted HTML to MD/JSON/YAML

In [40]:
file_name = "page_1.html"
table_num = 0
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what data type is orange
"""

file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw {input_format_type} Table")
# print(f"{table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected Raw {input_format_type} table contains: {num_tokens_in_html} tokens\n")

formmated_table_contents = table_to_format(table_contents, input_format_type, print_format_type)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_table_contents}\n")

num_tokens_in_formatted = num_tokens_from_string(str(formmated_table_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{formmated_table_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw html Table
Selected Raw html table contains: 649 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 590 tokens

input tokens: 633
output tokens: 12

response: Orange would be considered as Short Text data type in Access.


### Single Table - NonRAG - Summarize HTML

In [43]:
file_name = "page_1.html"
table_num = 0
input_format_type = f"html"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected Raw HTML table contains: {num_tokens_in_html} tokens\n")


payload = {
    'messages': [
        {
            'role': 'system',
            'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
        },
        {
            'role': 'user', 
            'content': f"{table_contents}"
        }
    ],
    'model': "openchat/openchat-3.5-1210",
    'max_tokens': 2048,
    "temperature": 0.0
}
soup_table_summary = make_together_call(payload)
print(f"Selected Raw HTML Table to Summary")
# print(f"{soup_table_summary}\n")

num_tokens_in_summary = num_tokens_from_string(str(soup_table_summary), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{formmated_table_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw html Table
Selected Raw html table contains: 649 tokens

Selected Raw html Table as Summary
Selected Formatted md table contains: 82 tokens

input tokens: 633
output tokens: 12

response: Orange would be considered as Short Text data type in Access.


### Single Table - RAG - Raw HTML

In [47]:
file_name = "page_1.html"
table_num = 0
user_query = f"""
what data type is orange
"""

file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")

# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(table_contents), 512)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)
# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 649 tokens

all_chunks
['<table aria-label="" class="banded">\n<tbody>\n<tr>\n<td>\n<table aria-label="" class="banded">\n<thead>\n<tr>\n<th>\n<p>\n<b class="ocpLegacyBold">If you enter:</b>\n</p>\n</th>\n<th>\n<p>\n<b class="ocpLegacyBold">Access creates a field with a data type of:</b>\n</p>\n</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>\n<p>John</p>\n</td>\n<td>\n<p>Short Text</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>\n<span class="ocpFictitious">http://www.contoso.com</span>\n</p>\n<p>You can use any valid Internet protocol prefix. For example, http://, https://, and m', 'ailto: are valid prefixes.</p>\n</td>\n<td>\n<p>Hyperlink</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>1</p>\n</td>\n<td>\n<p>Number, Long Integer</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>50,000</p>\n</td>\n<td>\n<p>Number, Long Integer</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>50,000.99</p>\n</td>\n<td>\n<p

### Single Table - RAG - Formatted HTML to MD/JSON/YAML

In [48]:
file_name = "page_1.html"
table_num = 0
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")


formmated_table_contents = table_to_format(table_contents, input_format_type, print_format_type)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_table_contents}\n")

num_tokens_in_formatted = num_tokens_from_string(str(formmated_table_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(formmated_table_contents), 512)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)
# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 649 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 590 tokens

all_chunks
['| If you enter: | Access creates a field with a data type of: |\n| --- | --- |\n| If you enter:Access creates a field with a data type of:JohnShort Texthttp://www.contoso.comYou can use any valid Internet protocol prefix. For example, http://, https://, and mailto: are valid prefixes.Hyperlink1Number, Long Integer50,000Number, Long Integer50,000.99Number, Double50000.389Number, Double12/67The date and time formats recognized are those of your user locale.Date/TimeDecember 31, 2016Date/Time10:50:23Date/Time10:', '50 amDate/Time17:50Date/Time$12.50The currency symbol recognized is that of your user locale.Currency21.75Number, Double123.00%Number, Double3.46E+03Number, Double | John | Short Text | http://www.contoso.comYou can use any valid Inte

### Single Table - RAG - Summarize HTML

In [49]:
file_name = "page_1.html"
table_num = 0
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw Table")
# print(f"{table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")


payload = {
    'messages': [
        {
            'role': 'system',
            'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
        },
        {
            'role': 'user', 
            'content': f"{table_contents}"
        }
    ],
    'model': "openchat/openchat-3.5-1210",
    'max_tokens': 2048,
    "temperature": 0.0
}
soup_table_summary = make_together_call(payload)
print(f"Selected Raw HTML Table to Summary")
# print(f"{soup_table_summary}\n")

num_tokens_in_summary = num_tokens_from_string(str(soup_table_summary), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(soup_table_summary), 512)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)
# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected table contains: 649 tokens

Selected Raw HTML Table to Summary
Selected Summary Table contains: 82 tokens

all_chunks
[' Title: Data Types in Access\n\nWhen you enter different types of data into Access, it creates fields with specific data types. For example, entering a name creates a Short Text field, a URL creates a Hyperlink field, and a number creates a Long Integer or Double field, depending on its value. Date and time values are stored in a Date/Time field, while currency values are stored as Currency.']

documents_list: [' Title: Data Types in Access\n\nWhen you enter different types of data into Access, it creates fields with specific data types. For example, entering a name creates a Short Text field, a URL creates a Hyperlink field, and a number creates a Long Integer or Double field, depending on its value. Date and time values are stored in a Date/

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


results:
{'ids': [['8ed420d4-dfa7-4d0d-a23b-70cc793b34a5']], 'distances': [[1.1767873764038086]], 'metadatas': [[{'chunk_num': 0, 'file_name': 'page_1.html', 'table_num': 0}]], 'embeddings': None, 'documents': [[' Title: Data Types in Access\n\nWhen you enter different types of data into Access, it creates fields with specific data types. For example, entering a name creates a Short Text field, a URL creates a Hyperlink field, and a number creates a Long Integer or Double field, depending on its value. Date and time values are stored in a Date/Time field, while currency values are stored as Currency.']], 'uris': None, 'data': None}
input tokens: 222
output tokens: 44

response: Based on the provided context, "orange" is not mentioned as a data type in the information given. The data types mentioned are Short Text, Hyperlink, Long Integer, Double, Date/Time, and Currency.


### Multiple Tables - NonRAG - Raw HTML

In [51]:
file_name = "page_1.html"
user_query = f"""
what data type is orange
"""

file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw Tables")
# print(f"{all_table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"ALL tables contain: {num_tokens_in_html} tokens\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{all_table_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw Tables
ALL tables contain: 651 tokens

input tokens: 694
output tokens: 10

response: Orange would be considered as Short Text data type.


### Multiple Table - NonRAG - Formatted HTML to MD/JSON/YAML

In [52]:
file_name = "page_1.html"
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw {input_format_type.upper()} Table")
# print(f"{all_table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw {input_format_type} table contains: {num_tokens_in_html} tokens\n")


formmated_tables_contents = []
for html_table in all_table_contents:
    formmated_table_content = table_to_format(table_contents, input_format_type, print_format_type)
    formmated_tables_contents.append(formmated_table_content)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_tables_contents}\n")

num_tokens_in_formatted = num_tokens_from_string(str(formmated_tables_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{formmated_tables_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw HTML Table
Selected Raw html table contains: 651 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 610 tokens

input tokens: 653
output tokens: 11

response: The data type for "orange" is Short Text.


### Multiple Table - NonRAG - Summarize HTML

In [53]:
file_name = "page_1.html"
input_format_type = f"html"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected Raw Table")
# print(f"{all_table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw HTML table contains: {num_tokens_in_html} tokens\n")


summarized_tables_contents = []
for html_table in all_table_contents:
    payload = {
        'messages': [
            {
                'role': 'system',
                'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
            },
            {
                'role': 'user', 
                'content': f"{html_table}"
            }
        ],
        'model': "openchat/openchat-3.5-1210",
        'max_tokens': 2048,
        "temperature": 0.0
    }
    soup_table_summary = make_together_call(payload)
    summarized_tables_contents.append(soup_table_summary)
print(f"Selected Raw HTML Table to Summary")
# print(f"{summarized_tables_contents}\n")

num_tokens_in_summary = num_tokens_from_string(str(summarized_tables_contents), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{summarized_tables_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected Raw HTML table contains: 651 tokens

Selected Raw HTML Table to Summary
Selected Summary Table contains: 84 tokens

input tokens: 127
output tokens: 75

response: Based on the context provided, the data type for "orange" is not explicitly mentioned. The context only provides examples of data types in Access such as Short Text, Hyperlink, Long Integer, Double, Date/Time, and Currency based on the type of data entered. Therefore, without further information, it is not possible to determine the specific data type for "orange."


### Multiple Table - RAG - Raw HTML

In [54]:
file_name = "page_1.html"
user_query = f"""
what data type is orange
"""

file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw Tables")
# print(f"{all_table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"ALL tables contain: {num_tokens_in_html} tokens\n")

# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the multiple tables
all_chunks = chunk_text_default(str(all_table_contents), 512)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)
# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw Tables
ALL tables contain: 651 tokens

all_chunks
['[<table aria-label="" class="banded">\n<tbody>\n<tr>\n<td>\n<table aria-label="" class="banded">\n<thead>\n<tr>\n<th>\n<p>\n<b class="ocpLegacyBold">If you enter:</b>\n</p>\n</th>\n<th>\n<p>\n<b class="ocpLegacyBold">Access creates a field with a data type of:</b>\n</p>\n</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>\n<p>John</p>\n</td>\n<td>\n<p>Short Text</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>\n<span class="ocpFictitious">http://www.contoso.com</span>\n</p>\n<p>You can use any valid Internet protocol prefix. For example, http://, https://, and ', 'mailto: are valid prefixes.</p>\n</td>\n<td>\n<p>Hyperlink</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>1</p>\n</td>\n<td>\n<p>Number, Long Integer</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>50,000</p>\n</td>\n<td>\n<p>Number, Long Integer</p>\n</td>\n</tr>\n<tr>\n<td>\n<p>50,000.99</p>\n</td>\n<td>\n<

### Multiple Table - RAG - Formatted HTML to MD/JSON/YAML

In [55]:
file_name = "page_1.html"
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected ALL Raw {input_format_type.upper()} Table")
# print(f"{all_table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw {input_format_type} table contains: {num_tokens_in_html} tokens\n")


formmated_tables_contents = []
for html_table in all_table_contents:
    formmated_table_content = table_to_format(table_contents, input_format_type, print_format_type)
    formmated_tables_contents.append(formmated_table_content)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_tables_contents}\n")

num_tokens_in_formatted = num_tokens_from_string(str(formmated_tables_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(formmated_tables_contents), 512)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)
# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected ALL Raw HTML Table
Selected Raw html table contains: 651 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 610 tokens

all_chunks
["['| If you enter: | Access creates a field with a data type of: |\\n| --- | --- |\\n| If you enter:Access creates a field with a data type of:JohnShort Texthttp://www.contoso.comYou can use any valid Internet protocol prefix. For example, http://, https://, and mailto: are valid prefixes.Hyperlink1Number, Long Integer50,000Number, Long Integer50,000.99Number, Double50000.389Number, Double12/67The date and time formats recognized are those of your user locale.Date/TimeDecember 31, 2016Date/Time10:50:23Date/Tim", 'e10:50 amDate/Time17:50Date/Time$12.50The currency symbol recognized is that of your user locale.Currency21.75Number, Double123.00%Number, Double3.46E+03Number, Double | John | Short Text | http://www.contoso.comYou 

### Multiple Table - RAG - Summarize HTML

In [56]:
file_name = "page_1.html"
input_format_type = f"html"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
print(f"Selected Raw Table")
# print(f"{all_table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(all_table_contents), "cl100k_base")
print(F"Selected Raw HTML table contains: {num_tokens_in_html} tokens\n")


summarized_tables_contents = []
for html_table in all_table_contents:
    payload = {
        'messages': [
            {
                'role': 'system',
                'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
            },
            {
                'role': 'user', 
                'content': f"{html_table}"
            }
        ],
        'model': "openchat/openchat-3.5-1210",
        'max_tokens': 2048,
        "temperature": 0.0
    }
    soup_table_summary = make_together_call(payload)
    summarized_tables_contents.append(soup_table_summary)
print(f"Selected Raw HTML Table to Summary")
# print(f"{summarized_tables_contents}\n")

num_tokens_in_summary = num_tokens_from_string(str(summarized_tables_contents), "cl100k_base")
print(F"Selected Summary Table contains: {num_tokens_in_summary} tokens\n")


# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = chunk_text_default(str(summarized_tables_contents), 512)
print(f"all_chunks\n{all_chunks}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks):
    documents_list.append(chunk)
    metadata_list.append({
        'file_name': file_name,
        'table_num': table_num,
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list}\nmetadata_list: {metadata_list}\nids_list: {ids_list}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)
# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected Raw HTML table contains: 651 tokens



Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


Selected Raw HTML Table to Summary
Selected Summary Table contains: 84 tokens

all_chunks
["[' Title: Data Types in Access\\n\\nWhen you enter different types of data into Access, it creates fields with specific data types. For example, entering a name creates a Short Text field, a URL creates a Hyperlink field, and a number creates a Long Integer or Double field, depending on its value. Date and time values are stored in a Date/Time field, while currency values are stored as Currency.']"]

documents_list: ["[' Title: Data Types in Access\\n\\nWhen you enter different types of data into Access, it creates fields with specific data types. For example, entering a name creates a Short Text field, a URL creates a Hyperlink field, and a number creates a Long Integer or Double field, depending on its value. Date and time values are stored in a Date/Time field, while currency values are stored as Currency.']"]
metadata_list: [{'file_name': 'page_1.html', 'table_num': 0, 'chunk_num': 0}]
ids_l

### Single Article - NonRAG - Raw HTML

In [57]:
file_name = "page_1.html"
user_query = f"""
what data type is orange
"""

file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
print(f"Selected Raw HTML Content")
# print(f"{html_content}\n")

num_tokens_in_html = num_tokens_from_string(str(html_content), "cl100k_base")
print(F"Selected table contains: {num_tokens_in_html} tokens\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{html_content}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Selected Raw HTML Content
Selected table contains: 49370 tokens



KeyError: 'usage'

### Single Article - NonRAG - Formatted HTML to MD/JSON/YAML

In [None]:
file_name = "page_1.html"
input_format_type = f"html"
print_format_type = f"md"
user_query = f"""
what data type is orange
"""

file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
all_table_contents = process_html_for_tables(html_content)
table_contents = all_table_contents[table_num]
print(f"Selected Raw {input_format_type} Table")
# print(f"{table_contents}\n")

num_tokens_in_html = num_tokens_from_string(str(table_contents), "cl100k_base")
print(F"Selected Raw {input_format_type} table contains: {num_tokens_in_html} tokens\n")

formmated_table_contents = table_to_format(table_contents, input_format_type, print_format_type)
print(f"Selected Raw {input_format_type} Table in {print_format_type}")
# print(f"{formmated_table_contents}\n")

num_tokens_in_formatted = num_tokens_from_string(str(formmated_table_contents), "cl100k_base")
print(F"Selected Formatted {print_format_type} table contains: {num_tokens_in_formatted} tokens\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{formmated_table_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw html Table
Selected Raw html table contains: 649 tokens

Selected Raw html Table in md
Selected Formatted md table contains: 590 tokens

input tokens: 633
output tokens: 12

response: Orange would be considered as Short Text data type in Access.


### Single Article - NonRAG - Summarize HTML

In [58]:
file_name = "page_1.html"
input_format_type = f"html"
user_query = f"""
what data type is orange
"""


file_path = os.path.join(r"..\Data\HTML", file_name)
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()
print(f"Selected Raw Article")
# print(f"{html_content}\n")

num_tokens_in_html = num_tokens_from_string(str(html_content), "cl100k_base")
print(F"Selected Raw HTML table contains: {num_tokens_in_html} tokens\n")


payload = {
    'messages': [
        {
            'role': 'system',
            'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
        },
        {
            'role': 'user', 
            'content': f"{html_content}"
        }
    ],
    'model': "openchat/openchat-3.5-1210",
    'max_tokens': 2048,
    "temperature": 0.0
}
soup_table_summary = make_together_call(payload)
print(f"Selected Raw HTML to Summary")
# print(f"{soup_table_summary}\n")

num_tokens_in_summary = num_tokens_from_string(str(soup_table_summary), "cl100k_base")
print(F"Selected Summary Article contains: {num_tokens_in_summary} tokens\n")


model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{formmated_table_contents}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Selected Raw Article
Selected Raw HTML table contains: 49370 tokens



KeyError: 'choices'

### Multiple Articles - RAG - Raw HTML

In [30]:
# Loading HTML from files
html_contents = []
directory = r"..\Data\HTML"
html_files = glob.glob(os.path.join(directory, '*.html'))
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        html_contents.append(html_content)
print(f"Selected ALL Articles: {len(html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(html_contents), "cl100k_base")
print(F"ALL tables contain: {num_tokens_in_html} tokens\n")

# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = []
for article_html_content in html_contents:
    chunk = chunk_text_default(str(article_html_content), 1024)
    all_chunks.append(chunk)
all_chunks_flat = [chunk for chunk_list in all_chunks for chunk in chunk_list]
print(f"all_chunks_flat: {len(all_chunks_flat)}\n{all_chunks_flat[:3]}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks_flat):
    documents_list.append(chunk)
    metadata_list.append({
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list[:3]}\nmetadata_list: {metadata_list[:3]}\nids_list: {ids_list[:3]}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)

Selected ALL Articles: 19
ALL tables contain: 1434912 tokens

all_chunks_flat: 4048
['\n\n<!DOCTYPE html>\n\n<html lang="en-US" dir="ltr">\n\n<head>\n\n\t<meta charset="utf-8" />\n\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0" />\n\n\t<title>Introduction to tables - Microsoft Support</title>\n\n\t\n\n\t\n\n\t\t<link rel="canonical" href="https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\n\n\t\t\t<link rel="alternate" hreflang="ar-SA" href="https://support.microsoft.com/ar-sa/topic/%D9%85%D9%82%D8%AF%D9%85%D8%A9-%D8%AD%D9%88%D9%84-%D8%A7%D9%84%D8%AC%D8%AF%D8%A7%D9%88%D9%84-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\t<link rel="alternate" hreflang="bg-BG" href="https://support.microsoft.com/bg-bg/topic/%D0%B2%D1%8A%D0%B2%D0%B5%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5-%D0%B2-%D1%82%D0%B0%D0%B1%D0%BB%D0%B8%D1%86%D0%B8%D1%82%D0%B5-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\t<link rel="alternate" hre

KeyboardInterrupt: 

In [28]:
user_query = f"""
what data type is orange
"""


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

results:
{'ids': [['84c28bba-b29a-4a45-b6d9-9d74e098d48a', '6faa70d5-59d0-4fc4-acd1-27d20a98a8fd', '41c7f93c-22cb-49d2-8617-ff3212827330']], 'distances': [[1.2600163221359253, 1.308709979057312, 1.3489704132080078]], 'metadatas': [[{'chunk_num': 4}, {'chunk_num': 210}, {'chunk_num': 11}]], 'embeddings': None, 'documents': [[" properties as follows: You set a field's data type in the table design grid, not in the Field Properties pane. A field's data type determines what other properties the field has. You must set a field's data type when you create the field. You can create a new field in Access by entering data in a new column in Datasheet view. When you create a field by entering data in Datasheet view, Access automatically assigns a data type for the field, based on the value that you enter. If no other data type is implied by your input, Access sets the data type to Text. If needed, you can change the data type by using the Ribbon. Examples of automatic data type detection The fol

### Multiple Articles - RAG - Formatted HTML to MD/JSON/YAML

In [32]:
# Data Loading

input_format_type = f"html"
print_format_type = f"md"

# Loading HTML from files
html_contents = []
directory = r"..\Data\HTML"
html_files = glob.glob(os.path.join(directory, '*.html'))
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content= file.read()
        html_content_soup = BeautifulSoup(html_content, 'html.parser')
        formmated_html_content = table_to_format(html_content_soup, input_format_type, print_format_type)
        html_contents.append(formmated_html_content)
print(f"Selected ALL Articles: {len(html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(html_contents), "cl100k_base")
print(F"ALL tables contain: {num_tokens_in_html} tokens\n")

# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = []
for article_html_content in html_contents:
    chunk = chunk_text_default(str(article_html_content), 1024)
    all_chunks.append(chunk)
all_chunks_flat = [chunk for chunk_list in all_chunks for chunk in chunk_list]
print(f"all_chunks_flat: {len(all_chunks_flat)}\n{all_chunks_flat[:3]}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks_flat):
    documents_list.append(chunk)
    metadata_list.append({
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list[:3]}\nmetadata_list: {metadata_list[:3]}\nids_list: {ids_list[:3]}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)

Selected ALL Articles: 19
ALL tables contain: 9776 tokens

all_chunks_flat: 44
['| If you enter: | Access creates a field with a data type of: |\n| --- | --- |\n| If you enter:Access creates a field with a data type of:JohnShort Texthttp://www.contoso.comYou can use any valid Internet protocol prefix. For example, http://, https://, and mailto: are valid prefixes.Hyperlink1Number, Long Integer50,000Number, Long Integer50,000.99Number, Double50000.389Number, Double12/67The date and time formats recognized are those of your user locale.Date/TimeDecember 31, 2016Date/Time10:50:23Date/Time10:50 amDate/Time17:50Date/Time$12.50The currency symbol recognized is that of your user locale.Currency21.75Number, Double123.00%Number, Double3.46E+03Number, Double | John | Short Text | http://www.contoso.comYou can use any valid Internet protocol prefix. For example, http://, https://, and mailto: are valid prefixes. | Hyperlink | 1 | Number, Long Integer | 50,000 | Number, Long Integer | 50,000.99 | 

In [34]:
user_query = f"""
what data type is orange
"""


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

results:
{'ids': [['7c6a63c8-5888-4227-bda7-6fdc48c295b9', '074780ce-a6d8-4c96-81ee-0a663357c5b5', 'afabc8a1-3bdf-4ea4-a888-e3caf6ab05ad']], 'distances': [[1.267984390258789, 1.267984390258789, 1.4264154434204102]], 'metadatas': [[{'chunk_num': 41}, {'chunk_num': 26}, {'chunk_num': 3}]], 'embeddings': None, 'documents': [['omerDiscounts.CustomerID |\n| Customers-Orders | one-to-many | Customers.CustomerID | Orders.CustomerID |\n| Unique Identifier for Each Table | Each table must have a single column that uniquely identifies each row in that table. This column is often referred to as the primary key. |\n| Unique Lookup Columns | The data values in the lookup column must be unique. In other words, the column can’t contain duplicates. In a Data Model, nulls and empty strings are equivalent to a blank, which is a distinct data value. This means that you can’t have multiple nulls in the lookup column. |\n| Compatible Data Types | The data types in the source column and lookup column must b

### Multiple Articles - RAG - Summarize HTML

In [36]:
# Data Loading

# Loading HTML from files
html_contents = []
directory = r"..\Data\HTML"
html_files = glob.glob(os.path.join(directory, '*.html'))
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content= file.read()
        html_contents.append(html_content)
print(f"Selected ALL Articles: {len(html_contents)}")
# print(f"{html_contents}\n")
num_tokens_in_html = num_tokens_from_string(str(html_contents), "cl100k_base")
print(F"ALL tables contain: {num_tokens_in_html} tokens\n")

# ChromaDB Setup
chroma_client = chromadb.Client(settings=Settings(allow_reset=True))
chroma_client.reset()
collection = chroma_client.create_collection(name="copilot")
# Chunk the single table
all_chunks = []
for article_html_content in html_contents:
    chunk = chunk_text_default(str(article_html_content), 1024)
    all_chunks.append(chunk)
all_chunks_flat = [chunk for chunk_list in all_chunks for chunk in chunk_list]
print(f"all_chunks_flat: {len(all_chunks_flat)}\n{all_chunks_flat[:3]}\n")
# Summarize each chunk
all_chunks_flat_summary = []
for single_chunk in all_chunks_flat:
    payload = {
        'messages': [
            {
                'role': 'system',
                'content': f"You are a helpful assistant that takes as input raw dump of a table and summarizes it. Create a title for the table and explain each row in a line of text."
            },
            {
                'role': 'user', 
                'content': f"{single_chunk}"
            }
        ],
        'model': "openchat/openchat-3.5-1210",
        'max_tokens': 2048,
        "temperature": 0.0
    }
    soup_table_summary = make_together_call(payload)
    all_chunks_flat_summary.append(soup_table_summary)
print(f"all_chunks_flat_summary: {len(all_chunks_flat_summary)}\n{all_chunks_flat_summary[:3]}\n")
# Add other data
documents_list = []
metadata_list = []
ids_list = []
for chunk_num, chunk in enumerate(all_chunks_flat_summary):
    documents_list.append(chunk)
    metadata_list.append({
        'chunk_num': chunk_num,
    })
    ids_list.append(str(uuid.uuid4()))
print(f"documents_list: {documents_list[:3]}\nmetadata_list: {metadata_list[:3]}\nids_list: {ids_list[:3]}\n")
# Insert into collection
collection.add(
    documents=documents_list,
    metadatas=metadata_list,
    ids=ids_list
)

Selected ALL Articles: 19
ALL tables contain: 1434912 tokens

all_chunks_flat: 4048
['\n\n<!DOCTYPE html>\n\n<html lang="en-US" dir="ltr">\n\n<head>\n\n\t<meta charset="utf-8" />\n\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0" />\n\n\t<title>Introduction to tables - Microsoft Support</title>\n\n\t\n\n\t\n\n\t\t<link rel="canonical" href="https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\n\n\t\t\t<link rel="alternate" hreflang="ar-SA" href="https://support.microsoft.com/ar-sa/topic/%D9%85%D9%82%D8%AF%D9%85%D8%A9-%D8%AD%D9%88%D9%84-%D8%A7%D9%84%D8%AC%D8%AF%D8%A7%D9%88%D9%84-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\t<link rel="alternate" hreflang="bg-BG" href="https://support.microsoft.com/bg-bg/topic/%D0%B2%D1%8A%D0%B2%D0%B5%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5-%D0%B2-%D1%82%D0%B0%D0%B1%D0%BB%D0%B8%D1%86%D0%B8%D1%82%D0%B5-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />\n\n\t\t\t<link rel="alternate" hre

KeyboardInterrupt: 

In [None]:
user_query = f"""
what data type is orange
"""


# Retrieval
results = collection.query(
    query_texts=user_query,
    n_results=3
)
print(f"results:\n{results}\n")

model_choice = "gpt-3.5-turbo"
system_prompt = f"""
You are a helpful assistant. Without any prior context answer the user's qustion based on the context provided only.
"""
user_prompt = f"""
{user_query}
Context
{results}
"""

payload = {
    'messages': [
        {
            'role': 'system', 
            'content': system_prompt.strip()
        },
        {
            'role': 'user',
            'content': user_prompt.strip()
        }
    ],
    'model': model_choice,
    'max_tokens': 256,
    "temperature": 0.0,
    'seed': 48
}
input_tokens, output_tokens, response = make_openai_call(payload)
print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\n")

print(f"response: {response}")

Num of Table Elements: 2
Num of Parent Table Elements: 1
Num of Filtered Parent Table Elements: 1

Selected Raw Table
Selected Raw HTML table contains: 651 tokens



Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


Selected Raw HTML Table to Summary
Selected Summary Table contains: 84 tokens

all_chunks
["[' Title: Data Types in Access\\n\\nWhen you enter different types of data into Access, it creates fields with specific data types. For example, entering a name creates a Short Text field, a URL creates a Hyperlink field, and a number creates a Long Integer or Double field, depending on its value. Date and time values are stored in a Date/Time field, while currency values are stored as Currency.']"]

documents_list: ["[' Title: Data Types in Access\\n\\nWhen you enter different types of data into Access, it creates fields with specific data types. For example, entering a name creates a Short Text field, a URL creates a Hyperlink field, and a number creates a Long Integer or Double field, depending on its value. Date and time values are stored in a Date/Time field, while currency values are stored as Currency.']"]
metadata_list: [{'file_name': 'page_1.html', 'table_num': 0, 'chunk_num': 0}]
ids_l