In [99]:
import os
import json
import yaml
import requests
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import markdown

In [91]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

In [109]:
# Function to make API call and create heatmap
def make_openai_call(payload):
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {openai_api_key}',
    }
    response = requests.post('https://api.openai.com/v1/chat/completions', json=payload, headers=headers)
    
    result = response.json()

    input_tokens = result['usage']['prompt_tokens']
    output_tokens = result['usage']['completion_tokens']
    result_text = result['choices'][0]['message']['content']

    return input_tokens, output_tokens, result_text

In [77]:
def extract_tables_from_html(html_content: str) -> list[pd.DataFrame]:
    """Extract tables from HTML content and return them as a list of pandas DataFrames."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return [pd.read_html(str(table))[0] for table in soup.find_all('table')]

def extract_tables_from_json(json_content: str) -> list[pd.DataFrame]:
    """Extract tables from JSON content and return them as a list of pandas DataFrames."""
    data = json.loads(json_content)
    return [pd.DataFrame(table_data) for table_data in data.values()]

def extract_tables_from_md(md_content: str) -> list[pd.DataFrame]:
    """Convert Markdown content to HTML and extract tables from it."""
    html_content = markdown.markdown(md_content)
    return extract_tables_from_html(html_content)

def extract_tables_from_file(file_path: str) -> list[pd.DataFrame]:
    """Extract tables from a file based on its extension."""
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    if file_path.endswith('.html'):
        return extract_tables_from_html(content)
    elif file_path.endswith('.json'):
        return extract_tables_from_json(content)
    elif file_path.endswith('.md'):
        return extract_tables_from_md(content)
    else:
        raise ValueError("Unsupported file format")

In [93]:
def dataframe_to_text(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to a text format."""
    return dataframe.to_string(index=False)

def dataframe_to_html(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to HTML format."""
    return dataframe.to_html(index=False)

def dataframe_to_json(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to JSON format."""
    return dataframe.to_json(orient='records', indent=4)

def dataframe_to_md(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to Markdown format by first converting to HTML."""
    return markdown.markdown(dataframe.to_html(index=False))

def dataframe_to_yaml(dataframe: pd.DataFrame) -> str:
    """Convert a pandas DataFrame to YAML format."""
    return yaml.dump(dataframe.to_dict(orient='records'), default_flow_style=False)

def print_format_from_table(print_format: str, dataframe: pd.DataFrame) -> None:
    """Print or save a pandas DataFrame in the specified format."""
    format_functions = {
        'html': dataframe_to_html,
        'json': dataframe_to_json,
        'md': dataframe_to_md,
        'txt': dataframe_to_text,
        'yaml': dataframe_to_yaml
    }
    
    if print_format not in format_functions:
        raise ValueError("Unsupported save format")

    print_content = format_functions[print_format](dataframe)
    return print_content

In [112]:
data_types = ['txt', 'html', 'json', 'md', 'yaml']
html_file_path = r"C:\X\Dev\PythonProjects\Copilot\Data\HTML\test2.html"

In [113]:
# Example usage
try:
    dataframes = extract_tables_from_file(html_file_path)
    else:
        print("No tables found.")
except ValueError as e:
    print(e)

  return [pd.read_html(str(table))[0] for table in soup.find_all('table')]
  return [pd.read_html(str(table))[0] for table in soup.find_all('table')]


In [117]:
for output_type in data_types:
    print_content = print_format_from_table(output_type, dataframes[0])
    print(f"{output_type}\n{print_content}\n")

    payload = {
        'messages': [
            {
                'role': 'system', 
                'content': f"You are a helpful assistant"
            },
            {
                'role': 'user', 

                # html1: All questions about type based on entered value are correct
                # 'content': f"What type of field would be created if i enter orange?\n{print_content}"

                # html2 table 1 and 2: All questions about type based on entered value are correct
                'content': f"What are the product ids of 20052?\n{print_content}"
                # 'content': f"What is the sale date and order id of C6077B?\n{print_content}"
                # 'content': f"What are the order ids for 885?\n{print_content}"
            }
        ],
        'model': "gpt-3.5-turbo",
        'max_tokens': 256,
        "temperature": 0.0,
        'seed': 48
    }
    input_tokens, output_tokens, response = make_openai_call(payload)
    print(f"input tokens: {input_tokens}\noutput tokens: {output_tokens}\nresponse: {response}\n")

txt
 Order ID Sale Date Product ID
    20050    2/2/14     C6077B
    20050    2/2/14    C9250LB
    20051    2/2/14      M115A
    20052    2/3/14      A760G
    20052    2/3/14      E3331
    20052    2/3/14     SP1447
    20053    2/3/14       L88M
    20054    2/4/14    S1018MM
    20055    2/5/14     C6077B
    20056    2/6/14      E3331
    20056    2/6/14      D534X

input tokens: 212
output tokens: 12
response: The product ID of 20052 is A760G.

html
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Order ID</th>
      <th>Sale Date</th>
      <th>Product ID</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>20050</td>
      <td>2/2/14</td>
      <td>C6077B</td>
    </tr>
    <tr>
      <td>20050</td>
      <td>2/2/14</td>
      <td>C9250LB</td>
    </tr>
    <tr>
      <td>20051</td>
      <td>2/2/14</td>
      <td>M115A</td>
    </tr>
    <tr>
      <td>20052</td>
      <td>2/3/14</td>
      <td>A760G</td>
    </tr>
    <tr>
    