# Testing Table Format Token Lengths

In [1]:
# Setup

import os
import re
import json
import uuid
import yaml
import tiktoken
import requests
import pandas as pd
from io import StringIO
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import markdown
import chromadb
from chromadb.config import Settings

In [2]:
# Data

# Output formats whose token counts are compared later in the notebook.
data_types = ['txt', 'html', 'json', 'md', 'yaml']

# Build the path with os.path.join instead of a hard-coded backslash string:
# backslashes are only path separators on Windows, so the literal path would
# not resolve on Linux/macOS. On Windows the joined path is unchanged.
file_df = pd.read_csv(os.path.join("..", "Data", "Other", "html_data.csv"))
file_df.head(3)

Unnamed: 0,Index,URL,Num of Tables,Pictures of Tables,Dynamic Dropdown,Note
0,1,https://support.microsoft.com/en-us/office/int...,1,0,No,-
1,2,https://support.microsoft.com/en-us/office/for...,1,0,No,-
2,3,https://support.microsoft.com/en-us/office/vid...,1,0,No,-


In [3]:
# Data File Related

def fetch_html(url, timeout=10):
    """Download the page at `url` and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise None. Failures are reported on stdout rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Covers connection errors, timeouts and malformed URLs.
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve HTML. Status code: {response.status_code}")
        return None

    return response.text

def get_all_urls():
    """Return every value of the 'URL' column of `file_df` as a plain list."""
    return file_df['URL'].tolist()

In [4]:
# Chunking Related

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the tiktoken encoding `encoding_name`."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

In [5]:
# Extract DF from Formats

def extract_tables_from_html(html_content: str) -> pd.DataFrame:
    """Parse `html_content` and return its first table as a DataFrame.

    Only the first table found by `pandas.read_html` is returned; any further
    tables in the markup are ignored.

    Args:
        html_content: HTML markup containing at least one <table>.

    Returns:
        The first parsed table.

    Raises:
        ValueError: Propagated from `pandas.read_html` when the markup
            contains no table.
    """
    # Wrap the markup in StringIO so read_html treats it as file-like content
    # rather than as a path/URL.
    parsed_tables = pd.read_html(StringIO(html_content))
    return parsed_tables[0]

def extract_tables_from_json(json_content: str) -> pd.DataFrame:
    """Parse `json_content` and return its first table as a DataFrame.

    The payload must be a JSON object whose values are table data; only the
    first value (in document order) is converted, the rest are ignored.

    Args:
        json_content: JSON text of the form {"<name>": <table data>, ...}.

    Returns:
        A DataFrame built from the first value of the object.

    Raises:
        ValueError: If the text is not valid JSON (json.JSONDecodeError is a
            ValueError subclass), or if the top-level value is not a
            non-empty object.
    """
    data = json.loads(json_content)
    if not isinstance(data, dict) or not data:
        raise ValueError("JSON content must be a non-empty object of tables")

    first_table = next(iter(data.values()))
    return pd.DataFrame(first_table)

def extract_tables_from_md(md_content: str) -> pd.DataFrame:
    """Render Markdown to HTML and return the first table it contains.

    Args:
        md_content: Markdown text containing a table, either as a pipe table
            or as an embedded raw HTML <table>.

    Returns:
        The first table, as produced by `extract_tables_from_html`.
    """
    # Python-Markdown only turns pipe tables into <table> markup when the
    # 'tables' extension is enabled; without it they come out as paragraphs
    # and no table is found downstream.
    html_content = markdown.markdown(md_content, extensions=['tables'])
    return extract_tables_from_html(html_content)

def extract_tables_from_file(html_text: str, source: str) -> pd.DataFrame:
    """Extract the first table from `html_text`, parsed according to `source`.

    Args:
        html_text: The content to parse (HTML, JSON or Markdown text).
        source: The format of the content. Accepts a bare format name
            ("html", "json", "md"), an extension (".json") or a file name
            ("page.md"); matching is case-insensitive.

    Returns:
        The first table found, as a DataFrame.

    Raises:
        ValueError: If `source` does not name a supported format.
    """
    # Normalise to the text after the last dot so every format is matched the
    # same way; previously "html" had to be a bare name while json/md had to
    # be dotted extensions.
    fmt = source.rsplit('.', 1)[-1].lower()

    if fmt == "html":
        return extract_tables_from_html(html_text)
    elif fmt == "json":
        return extract_tables_from_json(html_text)
    elif fmt == "md":
        return extract_tables_from_md(html_text)
    else:
        raise ValueError("Unsupported file format")

In [6]:
# Convert DF to Formats

def dataframe_to_text(dataframe: pd.DataFrame) -> str:
    """Render `dataframe` as plain aligned text, leaving out the index."""
    rendered = dataframe.to_string(index=False)
    return rendered

def dataframe_to_html(dataframe: pd.DataFrame) -> str:
    """Render `dataframe` as an HTML <table>, leaving out the index."""
    markup = dataframe.to_html(index=False)
    return markup

def dataframe_to_json(dataframe: pd.DataFrame) -> str:
    """Serialise `dataframe` as a JSON array of row objects, indented by 4 spaces."""
    options = {'orient': 'records', 'indent': 4}
    return dataframe.to_json(**options)

def dataframe_to_md(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to a Markdown pipe table (index omitted).

    The previous implementation passed the DataFrame's HTML through
    `markdown.markdown`, which converts Markdown *to* HTML, so the result was
    HTML rather than Markdown. The table is now built directly: a header row,
    a `---` separator row, then one row per record.

    Args:
        dataframe: The table to render.

    Returns:
        The Markdown table as a single string, rows joined by newlines.
    """

    def _cell(value) -> str:
        # A literal "|" would end the cell and a line break would end the
        # row, so both are neutralised.
        text = str(value).replace("|", "\\|")
        return text.replace("\r\n", "<br>").replace("\n", "<br>")

    def _row(values) -> str:
        return "| " + " | ".join(_cell(v) for v in values) + " |"

    lines = [
        _row(dataframe.columns),
        "| " + " | ".join("---" for _ in dataframe.columns) + " |",
    ]
    lines.extend(
        _row(record) for record in dataframe.itertuples(index=False, name=None)
    )
    return "\n".join(lines)

def dataframe_to_yaml(dataframe: pd.DataFrame) -> str:
    """Dump `dataframe` as block-style YAML: a list with one mapping per row."""
    records = dataframe.to_dict(orient='records')
    return yaml.dump(records, default_flow_style=False)

def print_format_from_table(print_format: str, dataframe: pd.DataFrame) -> str:
    """Render `dataframe` in the requested format and return the text.

    Despite its name, this function neither prints nor saves anything; the
    caller decides what to do with the returned string.

    Args:
        print_format: One of 'html', 'json', 'md', 'txt' or 'yaml'.
        dataframe: The table to render.

    Returns:
        The table serialised in the requested format.

    Raises:
        ValueError: If `print_format` is not one of the supported formats.
    """
    format_functions = {
        'html': dataframe_to_html,
        'json': dataframe_to_json,
        'md': dataframe_to_md,
        'txt': dataframe_to_text,
        'yaml': dataframe_to_yaml,
    }

    if print_format not in format_functions:
        raise ValueError("Unsupported save format")

    return format_functions[print_format](dataframe)

In [7]:
# Load every URL listed in the data file's 'URL' column.
all_urls = get_all_urls()
print(f"all_urls: {all_urls}")

# Work with the first URL only for the single-page walkthrough below.
one_url = all_urls[0]
print(f"one_url: {one_url}")

all_urls: ['https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7', 'https://support.microsoft.com/en-us/office/format-a-table-e6e77bc6-1f4e-467e-b818-2e2acc488006', 'https://support.microsoft.com/en-us/office/video-get-started-with-table-relationships-728d53ff-f332-4ac6-9382-574ee271500a', 'https://support.microsoft.com/en-us/office/resize-a-table-column-or-row-9340d478-21be-4392-81cf-488f7bbd6715', 'https://support.microsoft.com/en-us/office/using-structured-references-with-excel-tables-f5ed2452-2337-4f71-bed3-c8ae6d2b276e', 'https://support.microsoft.com/en-us/office/how-can-i-merge-two-or-more-tables-c80a9fce-c1ab-4425-bb96-497dd906d656', 'https://support.microsoft.com/en-us/office/create-a-form-that-contains-a-subform-a-one-to-many-form-ddf3822f-8aba-49cb-831a-1e74d6f5f06b', 'https://support.microsoft.com/en-us/office/relationships-between-tables-in-a-data-model-533dc2b6-9288-4363-9538-8ea6e469112b', 'https://support.microsoft.com/e

In [8]:
# Download the page; fetch_html returns None (after printing the reason)
# when the request fails or the status code is not 200.
raw_html = fetch_html(one_url)
print(f"raw_html: {raw_html}")

raw_html: 
<!DOCTYPE html>
<html lang="en-US" dir="ltr">
<head>
	<meta charset="utf-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<title>Introduction to tables - Microsoft Support</title>
	
	
		<link rel="canonical" href="https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />
		
			<link rel="alternate" hreflang="ar-SA" href="https://support.microsoft.com/ar-sa/topic/%D9%85%D9%82%D8%AF%D9%85%D8%A9-%D8%AD%D9%88%D9%84-%D8%A7%D9%84%D8%AC%D8%AF%D8%A7%D9%88%D9%84-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />
			<link rel="alternate" hreflang="bg-BG" href="https://support.microsoft.com/bg-bg/topic/%D0%B2%D1%8A%D0%B2%D0%B5%D0%B4%D0%B5%D0%BD%D0%B8%D0%B5-%D0%B2-%D1%82%D0%B0%D0%B1%D0%BB%D0%B8%D1%86%D0%B8%D1%82%D0%B5-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />
			<link rel="alternate" hreflang="cs-CZ" href="https://support.microsoft.com/cs-cz/topic/%C3%BAvod-do-tabulek-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7" />
			<link 

In [9]:
# Token count of the whole raw page under the "cl100k_base" tiktoken encoding.
num_tokens_from_string(raw_html, "cl100k_base")

49300

In [10]:
# Parse the downloaded page with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(raw_html, 'html.parser')

In [11]:
# Find all table elements (this includes tables nested inside other tables)
tables = soup.find_all('table')
print(f"Num of Table Elements: {len(tables)}")

Num of Table Elements: 2


In [12]:
# Keep only outermost tables: a table that has a <table> ancestor is nested
# inside another one and is dropped here.
top_level_tables = [
    table
    for table in tables
    if table.find_parent('table') is None
]
print(f"Num of Parent Table Elements: {len(top_level_tables)}")

Num of Parent Table Elements: 1


In [13]:
# Drop tables that only wrap images or that carry no text at all.
tables_with_content = []
for table in top_level_tables:
    # Every descendant tag of the table, and the <img> tags among them.
    all_elements = table.find_all(True)
    img_elements = table.find_all('img')

    # Scan the <td> cells once and classify them two ways.
    td_cells = table.find_all('td')
    non_empty_cells = [cell for cell in td_cells if cell.get_text(strip=True)]
    img_wrappers = [cell for cell in td_cells if cell.find('img')]

    # Skip tables without any text-bearing cell.
    if len(non_empty_cells) == 0:
        continue
    # Skip tables whose non-image tags do not outnumber the image-holding cells.
    non_img_count = len(all_elements) - len(img_elements)
    if non_img_count <= len(img_wrappers):
        continue

    tables_with_content.append(table)
print(f"Num of Filtered Parent Table Elements: {len(tables_with_content)}")

Num of Filtered Parent Table Elements: 1


In [18]:
# For each filtered table: parse it into a DataFrame, re-serialise it in every
# format listed in data_types, and print the cl100k_base token count of each.
# NOTE(review): table_summaries and output_format are not read in this cell —
# presumably they are used by later cells; confirm before removing.
table_summaries = []
output_format = "html"
for i, table in enumerate(tables_with_content):
    print(f"Table {i}")
    
    # str(table) is the table's own markup, parsed here as "html".
    soup_table_df = extract_tables_from_file(str(table), "html")
    print(f"soup_table_df: {soup_table_df}")
    
    for output_type in data_types:
        # Serialise the same DataFrame in this format...
        soup_table_format = print_format_from_table(output_type, soup_table_df)
        print(f"soup_table_format: {soup_table_format}")

        # ...and measure how many tokens that representation costs.
        tok_cnt = num_tokens_from_string(soup_table_format, "cl100k_base")
        print(f"tok_cnt for {output_type}: {tok_cnt}")

Table 0
soup_table_df:                                         If you enter:  \
0   If you enter: Access creates a field with a da...   
1                                       If you enter:   
2                                                John   
3   http://www.contoso.com You can use any valid I...   
4                                                   1   
5                                               50000   
6                                            50000.99   
7                                           50000.389   
8   12/67 The date and time formats recognized are...   
9                                   December 31, 2016   
10                                           10:50:23   
11                                           10:50 am   
12                                              17:50   
13  $12.50 The currency symbol recognized is that ...   
14                                              21.75   
15                                            123.00%   
16      