In [1]:
import os
import json
import yaml
import requests
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import markdown

In [2]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

In [50]:
data_types = ['txt', 'html', 'json', 'md', 'yaml']
file_df = pd.read_csv(r"..\Data\Other\html_data.csv")
file_df.head(3)

Unnamed: 0,Index,URL,Num of Tables,Pictures of Tables,Dynamic Dropdown,Note
0,1,https://support.microsoft.com/en-us/office/int...,1,0,No,-
1,2,https://support.microsoft.com/en-us/office/for...,1,0,No,-
2,3,https://support.microsoft.com/en-us/office/vid...,1,0,No,-


In [30]:
def fetch_html(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to retrieve HTML. Status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [49]:
def get_url_from_index(search_index):
    row = file_df.loc[file_df['Index'] == search_index]
    if not row.empty:
        url_value = row['URL'].iloc[0]
        print("URL:", url_value)
        return url_value
    else:
        print("No URL found for the given condition.")
        return None

In [3]:
def make_openai_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}',
    }
    response = requests.post('https://api.openai.com/v1/chat/completions', json=payload, headers=headers)
    
    result = response.json()

    input_tokens = result['usage']['prompt_tokens']
    output_tokens = result['usage']['completion_tokens']
    result_text = result['choices'][0]['message']['content']

    return input_tokens, output_tokens, result_text

In [44]:
def extract_tables_from_html(html_content: str) -> list[pd.DataFrame]:
    """Extract tables from HTML content and return them as a list of pandas DataFrames."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return [pd.read_html(str(table))[0] for table in soup.find_all('table')]

def extract_tables_from_json(json_content: str) -> list[pd.DataFrame]:
    """Extract tables from JSON content and return them as a list of pandas DataFrames."""
    data = json.loads(json_content)
    return [pd.DataFrame(table_data) for table_data in data.values()]

def extract_tables_from_md(md_content: str) -> list[pd.DataFrame]:
    """Convert Markdown content to HTML and extract tables from it."""
    html_content = markdown.markdown(md_content)
    return extract_tables_from_html(html_content)

def extract_tables_from_file(html_text: str, source: str) -> list[pd.DataFrame]:
    """Extract tables from HTML content string based on its extension."""

    if source == "html":
        return extract_tables_from_html(html_text)
    elif source.endswith('.json'):
        return extract_tables_from_json(html_text)
    elif source.endswith('.md'):
        return extract_tables_from_md(html_text)
    else:
        raise ValueError("Unsupported file format")

In [5]:
def dataframe_to_text(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to a text format."""
    return dataframe.to_string(index=False)

def dataframe_to_html(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to HTML format."""
    return dataframe.to_html(index=False)

def dataframe_to_json(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to JSON format."""
    return dataframe.to_json(orient='records', indent=4)

def dataframe_to_md(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to Markdown format by first converting to HTML."""
    return markdown.markdown(dataframe.to_html(index=False))

def dataframe_to_yaml(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to YAML format."""
    return yaml.dump(dataframe.to_dict(orient='records'), default_flow_style=False)

def print_format_from_table(print_format: str, dataframe: pd.DataFrame) -> None:
    """Print or save a pandas DataFrame in the specified format."""
    format_functions = {
        'html': dataframe_to_html,
        'json': dataframe_to_json,
        'md': dataframe_to_md,
        'txt': dataframe_to_text,
        'yaml': dataframe_to_yaml
    }
    
    if print_format not in format_functions:
        raise ValueError("Unsupported save format")

    print_content = format_functions[print_format](dataframe)
    return print_content

In [62]:
try:
    fetched_url = get_url_from_index(1)
    if fetched_url:
        html = fetch_html(fetched_url)
        if html:
            # print(html)
            dataframes = extract_tables_from_file(html, "html")
except ValueError as e:
    print(e)

URL: https://support.microsoft.com/en-us/office/introduction-to-tables-78ff21ea-2f76-4fb0-8af6-c318d1ee0ea7


  return [pd.read_html(str(table))[0] for table in soup.find_all('table')]
  return [pd.read_html(str(table))[0] for table in soup.find_all('table')]


In [63]:
dataframes[0]

Unnamed: 0,If you enter:,Access creates a field with a data type of:
0,If you enter: Access creates a field with a da...,
1,If you enter:,Access creates a field with a data type of:
2,John,Short Text
3,http://www.contoso.com You can use any valid I...,Hyperlink
4,1,"Number, Long Integer"
5,50000,"Number, Long Integer"
6,50000.99,"Number, Double"
7,50000.389,"Number, Double"
8,12/67 The date and time formats recognized are...,Date/Time
9,"December 31, 2016",Date/Time


In [67]:
for output_type in data_types:
    print_content = print_format_from_table(output_type, dataframes[0])
    print(f"{output_type}")
    # print(f"{print_content}")

    payload = {
        'messages': [
            {
                'role': 'system', 
                
                # Default
                'content': f"You are a helpful assistant"

                # Seems to give worse results
                # 'content': f"You are a helpful assistant that can read a table. ONLY respond with the answer."
            },
            {
                'role': 'user', 

                # 1: All questions about type based on entered value are correct
                # 'content': f"What type of field would be created if i enter orange?\n{print_content}"
                'content': f"What type of field would be created if i enter 2387456283746532?\n{print_content}"

                # html2 table 1 and 2: All questions about type based on entered value are correct
                # 'content': f"What are the product ids of 20052?\n{print_content}"
                # 'content': f"What is the sale date and order id of C6077B?\n{print_content}"
                # 'content': f"What are the order ids for 885?\n{print_content}"
            }
        ],
        'model': "gpt-3.5-turbo",
        'max_tokens': 128,
        "temperature": 0.0,
        'seed': 48
    }
    input_tokens, output_tokens, response = make_openai_call(payload)
    print(f"input tokens: {input_tokens}")
    print(f"output tokens: {output_tokens}")
    print(f"response: {response}\n")

txt
input tokens: 531
output tokens: 29
response: If you enter the number 2387456283746532 in Access, it would create a field with a data type of Number, Double.

html
input tokens: 804
output tokens: 37
response: Based on the table you provided, if you were to enter the number 2387456283746532, Access would create a field with a data type of "Number, Double".

json
input tokens: 814
output tokens: 28
response: If you enter the number 2387456283746532, Access would create a field with a data type of "Number, Double".

md
input tokens: 804
output tokens: 38
response: Based on the information provided, if you were to enter the number 2387456283746532 into Access, it would create a field with a data type of "Number, Double".

yaml
input tokens: 776
output tokens: 29
response: If you enter the number 2387456283746532 in Access, it would create a field with a data type of Number, Double.

