## Imports

In [1]:
import openai
import tiktoken

import pandas as pd
import json

from dateutil import parser as date_parser
from unidecode import unidecode

import os
import time
import re

## Table Generator

In [2]:
class TableGenerator_JSON():
    """Builds table-retrieval prompts and parses the JSON-like replies.

    The workflow is two-step: a "keys" prompt asks for every key tuple of the
    table, then one "row" prompt per key tuple asks for the full row. Column
    names are normalised with ``_norm_field`` before they are placed in a
    prompt or looked up in a parsed response.
    """

    SYSTEM_MSG = "You are a retriever of facts."
    
    # Filled with: (query, key-column sentence, comma-joined keys, JSON skeleton).
    KEYS_TEMPLATE = """    
    We want to create a table with the detailed information about %s.
    %s.
    List all %s entities for the table. 
    The response will be formatted as JSON list shown below.
    
    RESPONSE FORMAT:
    [{
        %s
    }]
    """ 
    
    # Filled with: (query, all columns, key-column sentence, row key, JSON skeleton).
    ROW_TEMPLATE = """    
    We want to create a table with the detailed information about %s.
    Columns in the table are %s.
    %s.     
    Retrieve a single row whose key is %s.
    The response will be formatted as JSON dictionary shown below.
    Pay special attention to wrap all property names and values in double quotes!
    
    RESPONSE FORMAT:
    {
        %s
    }
    """ 

    # Single-pass character mapping used by _norm_field: separators become
    # underscores, the remaining punctuation is dropped (mapped to None).
    _FIELD_TRANSLATION = str.maketrans({
        " ": "_", "-": "_", ",": "_",
        ".": None, "(": None, ")": None, ":": None,
        '"': None, "'": None, "/": None,
    })

    def _norm_field(self, s):
        """Return ``s`` lower-cased and reduced to a snake_case-like field name.

        Spaces, hyphens and commas become underscores; ``. ( ) : " ' /`` are
        removed; runs of underscores are collapsed to a single one.
        """
        translated = s.lower().translate(self._FIELD_TRANSLATION)
        return re.sub(r'_+', '_', translated)

    def _key_columns(self, keys):
        """Return the sentence naming the key column(s), singular or plural."""
        if len(keys) == 1:
            return "The key column in the table is %s" % keys[0]
        return "The key columns in the table are %s" % ", ".join(keys)

    def generate_keys_prompt(self, query, keys):
        """Build the prompt that asks for all key tuples of the table.

        Args:
            query: Table description inserted into the template.
            keys: Key column names; they are normalised before use.

        Returns:
            The filled ``KEYS_TEMPLATE`` string.
        """
        keys = [self._norm_field(k) for k in keys]
        key_columns = self._key_columns(keys)
        # Skeleton shows each key mapped to its own name as a placeholder value.
        response_format = ', '.join('"%s": "%s"' % (key, key) for key in keys)
        return self.KEYS_TEMPLATE % (query, key_columns, ", ".join(keys), response_format)

    def parse_keys_response(self, response, keys):
        """Extract the list of key dictionaries from a keys-prompt reply.

        The reply is first trimmed to the outermost ``[...]`` span (or wrapped
        in brackets when it only contains a bare ``{...}``) and parsed as JSON.
        If that fails, a lenient fallback splits the text on braces, commas and
        colons to salvage ``name: value`` pairs.

        Args:
            response: Raw text returned by the model.
            keys: Key column names (un-normalised).

        Returns:
            A list of dicts, each mapping every normalised key name to its value.

        Raises:
            KeyError: If a parsed object lacks one of the normalised keys.
        """
        # Drop any chatter before the first "[" and after the last "]".
        if not response.startswith("[") and "[" in response:
            response = response[response.find("["):]

        if not response.endswith("]") and "]" in response:
            response = response[:response.rfind("]")+1]

        # A lone JSON object is treated as a one-element list.
        if '[' not in response and ']' not in response and '{' in response and '}' in response:
            response = '[' + response + ']'

        try:
            response_json = json.loads(response)
        except ValueError:
            # json.JSONDecodeError is a ValueError: fall back to manual salvage.
            response_json = self._salvage_key_objects(response)
        else:
            # {"some_wrapper": [...]} -> unwrap to the inner value.
            if isinstance(response_json, dict) and len(response_json.keys()) == 1:
                response_json = list(response_json.values())[0]

        norm_keys = [self._norm_field(key) for key in keys]
        keys_json = []
        for item in response_json:
            # Non-dict items (e.g. a plain list of strings) supply the value
            # for every key column as-is.
            key_item = {key: (item[key] if isinstance(item, dict) else item)
                        for key in norm_keys}
            keys_json.append(key_item)

        return keys_json

    def _salvage_key_objects(self, response):
        """Leniently pull ``name: value`` pairs out of malformed JSON text.

        Every ``{...}`` span becomes one dict. Pairs are split on commas and on
        the FIRST colon only, so values that contain a colon (such as
        ``12:30``) are kept whole. Double quotes and surrounding whitespace are
        stripped from names and values.
        """
        objects = []
        for chunk in response.split("{")[1:]:
            body, closing, _ = chunk.partition("}")
            if not closing:
                # Unterminated object: nothing reliable to extract.
                continue
            elements = {}
            for attr in body.split(","):
                name, colon, value = attr.partition(":")
                if colon:
                    elements[name.replace('"', '').strip()] = value.replace('"', '').strip()
            objects.append(elements)
        return objects

    def generate_row_prompt(self, query, keys, fields, fetched_key):
        """Build the prompt that asks for the single row identified by a key.

        Args:
            query: Table description inserted into the template.
            keys: Key column names; normalised before use.
            fields: All column names of the table; normalised before use.
            fetched_key: Dict of normalised key name -> value for the row.

        Returns:
            The filled ``ROW_TEMPLATE`` string.
        """
        keys = [self._norm_field(k) for k in keys]
        key_columns = self._key_columns(keys)

        fields = [self._norm_field(f) for f in fields]
        all_columns = ", ".join(fields)

        key_json = []
        fields_json = []
        for field in fields:
            if field in fetched_key:
                key_json.append("%s = %s" % (field, fetched_key[field]))
            # Known key values are pre-filled; other columns show their own
            # name as a placeholder.
            field_value = fetched_key.get(field, field)
            fields_json.append('"%s": "%s"' % (field, field_value))

        row_key = '(%s)' % ', '.join(key_json)
        response_format = ', '.join(fields_json)
        return self.ROW_TEMPLATE % (query, all_columns, key_columns, row_key, response_format)

    def parse_row_response(self, response):
        """Parse the JSON object contained in a row-prompt reply.

        Scans for the first ``{`` from which a complete JSON value can be
        decoded and returns it, ignoring any text before or after it. This
        copes with surrounding chatter or code fences and with nested objects.

        Args:
            response: Raw text returned by the model.

        Returns:
            The decoded JSON value (a dict when the reply contains an object).

        Raises:
            ValueError: If no JSON value can be decoded from the reply.
        """
        decoder = json.JSONDecoder()
        last_error = None
        start = response.find("{")
        while start != -1:
            try:
                row, _ = decoder.raw_decode(response, start)
            except ValueError as err:
                # This brace did not open valid JSON; try the next one.
                last_error = err
                start = response.find("{", start + 1)
            else:
                return row

        if last_error is not None:
            raise last_error
        # No braces at all: let json parse (or reject) the raw text.
        return json.loads(response)

    def create_dataframe(self, rows, columns, keys, df_ref):
        """Assemble the fetched rows into a DataFrame shaped like ``df_ref``.

        Args:
            rows: List of row dicts keyed by normalised column names.
            columns: Column names (un-normalised) defining selection and order.
            keys: Labels used for de-duplication; applied after the columns
                are relabelled with ``df_ref.columns``.
            df_ref: Object whose ``columns`` provide the final column labels.

        Returns:
            DataFrame with ``df_ref``'s column labels and duplicate keys dropped.
        """
        df = pd.DataFrame.from_dict(rows)
        columns = [self._norm_field(col) for col in columns]
        df = df[columns]
        df.columns = df_ref.columns
        df = df.drop_duplicates(subset=keys)
        return df

## Experiment Runner

In [3]:
class ExperimentRunner():
    """Runs the keys-then-rows table generation for tasks listed in a metadata file.

    For each task it sends one keys prompt and one row prompt per returned key
    to the chat model, writes the resulting table as CSV, and records every
    prompt and raw response in ``self.result``.
    """

    # NOTE(review): empty placeholder; presumably a real key is supplied
    # before running — confirm how credentials are meant to be provided.
    openai.api_key = ""
    MODEL = "gpt-4o"
    NOTE = 'generic_keys_rows'
    
    def __init__(self, table_generator, metadata_path):
        """Load task metadata and create a timestamped result folder.

        Args:
            table_generator: Object providing the prompt builders, response
                parsers, ``create_dataframe`` and ``SYSTEM_MSG``.
            metadata_path: Path to a JSON file; it is indexed by task id in
                ``fetch_data``.
        """
        with open(metadata_path, "rb") as f:
            self.metadata = json.load(f)
            
        self.table_generator = table_generator
        
        self.result_folder = "DATA/%s_%s_%s" % (self.MODEL.replace('-', '_'), 
                                                   self.NOTE,
                                                   time.strftime("%Y%m%d-%H%M%S"))
        
        print("Experiment result folder: %s" % self.result_folder)
        
        # Creates the result folder and its Tables subfolder in one call;
        # exist_ok avoids a crash when two runs start within the same second.
        os.makedirs("%s/Tables" % self.result_folder, exist_ok=True)
        
        self.result = {}

    def _chat(self, prompt):
        """Send ``prompt`` as the user message and return the stripped reply text."""
        result = openai.ChatCompletion.create(
            model=self.MODEL,
            messages=[{"role": "system", "content": self.table_generator.SYSTEM_MSG},
                      {"role": "user", "content": prompt}],
            temperature=0)
        return result["choices"][0]["message"]["content"].strip()

    def _failed_row(self, columns, key_instance):
        """Build a placeholder row for a key whose row could not be fetched.

        Known key values are kept (as strings); every other column is set to
        the literal ``"failed"``. The dict is built directly so that values
        containing quotes or backslashes cannot break the fallback itself.
        """
        fields = [self.table_generator._norm_field(col) for col in columns]
        return {field: "%s" % key_instance.get(field, "failed") for field in fields}
        
    def fetch_data(self, idx):
        """Generate the table for task ``idx`` and write it to CSV.

        Prompts, raw responses and the output path are stored under
        ``self.result[idx]``. A failure on an individual row is replaced by a
        placeholder row; any other failure after the keys prompt is built is
        reported by printing the exception class name.

        Args:
            idx: Index into the loaded metadata.

        Returns:
            The generated DataFrame, or ``None`` if the task failed.

        Raises:
            KeyError/IndexError: If ``idx`` or a required task field is missing
                from the metadata (raised before the guarded section).
        """
        task = self.metadata[idx]
        
        task_name = task['name']        
        print("Fetching data for %s" % task_name)
        
        query = task['table_title']
        keys = task['keys']
        columns = task['columns'] 
        
        keys_prompt = self.table_generator.generate_keys_prompt(query, keys)
        self.result[idx] = {'keys_prompt': keys_prompt} 
        
        try:
            keys_response = self._chat(keys_prompt)
            self.result[idx]['keys_response'] = [keys_response]    

            parsed_keys_response = self.table_generator.parse_keys_response(keys_response, keys)
            
            print("Fetched %d key instances" % len(parsed_keys_response))                       
            
            self.result[idx]['row_prompts'] = []
            self.result[idx]['row_responses'] = []
            rows = []

            for key_instance in parsed_keys_response:
                try:                    
                    row_prompt_i = self.table_generator.generate_row_prompt(query, keys, columns, key_instance)
                    self.result[idx]['row_prompts'].append(row_prompt_i)

                    row_response = self._chat(row_prompt_i)
                    self.result[idx]['row_responses'].append(row_response)

                    rows.append(self.table_generator.parse_row_response(row_response))
                except Exception as ie:
                    # Best effort: keep the table going with a placeholder row.
                    print(ie.__class__.__name__)
                    rows.append(self._failed_row(columns, key_instance))
        
            # The reference CSV supplies the final column labels.
            df_ref = pd.read_csv(task['path'])
            df = self.table_generator.create_dataframe(rows, columns, keys, df_ref) 

            table_path = "%s/Tables/%s.csv" % (self.result_folder, task_name)
            self.result[idx]['table_path'] = table_path                
            df.to_csv(table_path, index=False)            

            print("Created table with %d rows" % len(df))

            return df
        except Exception as e:  
            print(e.__class__.__name__)
            return None
            
    def save_result(self):
        """Write all recorded prompts and responses to ``result.json`` in the result folder."""
        with open("%s/result.json" % self.result_folder, "w") as outfile:
            result_json = json.dumps(self.result, indent=4)
            outfile.write(result_json)

## Test

In [4]:
# Driver: generate the first 100 benchmark tables and persist the run log.
tg = TableGenerator_JSON()

runner = ExperimentRunner(tg, metadata_path="DATA/Benchmark/cfg.json")

print("\n====================\n")

# The run issues many API calls; save whatever was collected even when the
# loop is interrupted or a metadata index is missing.
try:
    for i in range(100):
        print("Table # %d" % (i+1))
        idx = "%d" % i  # metadata is looked up by the string form of the index
        table = runner.fetch_data(idx)
        print("\n====================\n")
finally:
    runner.save_result()

Experiment result folder: DATA/gpt_4o_generic_keys_rows_20240728-073304


Table # 1
Fetching data for republican_straw_polls_2012
Fetched 10 key instances
Created table with 10 rows


Table # 2
Fetching data for russia_demographics_1946_2012
Fetched 67 key instances
Created table with 67 rows


Table # 3
Fetching data for belgium_demographics_1900_2011
Fetched 112 key instances
Created table with 112 rows


Table # 4
Fetching data for australia_demographics_1900_2010
Fetched 111 key instances
Created table with 111 rows


Table # 5
Fetching data for new_brunswick_parishes_2006_2011
Fetched 121 key instances
Created table with 119 rows


Table # 6
Fetching data for ice_hockey_2006
Fetched 20 key instances
Created table with 20 rows


Table # 7
Fetching data for biathlon_sprint_standings_2009_10
Fetched 20 key instances
Created table with 20 rows


Table # 8
Fetching data for anaheim_ducks_draft_picks_1998_2013
Fetched 28 key instances
Created table with 28 rows


Table # 9
Fetching data

Fetched 14 key instances
Created table with 14 rows


Table # 62
Fetching data for oxford_university_undergraduate_admissions_1988_2010
Fetched 23 key instances
Created table with 23 rows


Table # 63
Fetching data for serbia_demographics_1900_1912
Fetched 13 key instances
Created table with 13 rows


Table # 64
Fetching data for protein_data_bank_1976_2012
Fetched 37 key instances
Created table with 37 rows


Table # 65
Fetching data for european_athletics_championships_1986
Fetched 22 key instances
Created table with 22 rows


Table # 66
Fetching data for mongolia_provinces_population_79_89_00_09
Fetched 21 key instances
Created table with 21 rows


Table # 67
Fetching data for tulsa_shock_2010
Fetched 18 key instances
Created table with 18 rows


Table # 68
Fetching data for london_heathrow_busiest_routes_2012
Fetched 59 key instances
Created table with 59 rows


Table # 69
Fetching data for hungarian_grand_prix_qualifying_2012
Fetched 24 key instances
Created table with 24 rows


T