## Imports

In [1]:
import openai
import tiktoken

import pandas as pd
import json

from dateutil import parser as date_parser
from unidecode import unidecode

import os
import time
import re

## Table Generator

In [2]:
class TableGenerator_JSON():
    """Build row-retrieval prompts and parse the JSON rows returned by the model.

    The generator works on *normalised* column names (see ``_norm_field``):
    prompts list normalised names, and model responses are expected to use
    them as JSON property names.
    """

    ROW_TEMPLATE = """
    You are a retriever of facts.
    We want to create a table with the detailed information about %s.
    Columns in the table are %s.
    %s.     
    Retrieve a single row whose key is %s.
    The response will be formatted as JSON dictionary shown below.
    Pay special attention to wrap all property names and values in double quotes!
    
    RESPONSE FORMAT:
    {
        %s
    }
    """ 

    # Single-pass character mapping used by ``_norm_field``: separators become
    # underscores, punctuation is dropped.
    _NORM_TABLE = str.maketrans({
        " ": "_", "-": "_", ",": "_",
        ".": None, "(": None, ")": None, ":": None,
        '"': None, "'": None, "/": None,
    })

    def _norm_field(self, s):
        """Return ``s`` lower-cased and reduced to an identifier-like name.

        Spaces, hyphens and commas become underscores; ``. ( ) : " ' /`` are
        removed; runs of underscores are collapsed to a single one.
        """
        s = s.lower().translate(self._NORM_TABLE)
        return re.sub('_+', '_', s)

    def _key_columns(self, keys):
        """Return the prompt sentence naming the key column(s)."""
        if len(keys) == 1:
            return "The key column in the table is %s" % keys[0]
        return "The key columns in the table are %s" % ", ".join(keys)

    def generate_row_prompt(self, query, keys, fields, fetched_key):
        """Build the prompt asking the model for the row identified by ``fetched_key``.

        Args:
            query: Description of the table, inserted into the prompt.
            keys: Names of the key columns (original, un-normalised).
            fields: Names of all columns (original, un-normalised).
            fetched_key: Dict mapping key column name -> value for this row.
                NOTE: it is modified in place so that its keys become the
                normalised column names.

        Returns:
            The filled-in ``ROW_TEMPLATE`` string.

        Raises:
            KeyError: if a key column is present in ``fetched_key`` under
                neither its original nor its normalised name.
        """
        # Rename key entries to their normalised names. The rename is
        # idempotent so the same dict can be passed again (e.g. on a retry)
        # without raising KeyError on the already-renamed entries.
        for key in keys:
            normalised = self._norm_field(key)
            if key in fetched_key:
                fetched_key[normalised] = fetched_key.pop(key)
            elif normalised not in fetched_key:
                raise KeyError(key)

        keys = [self._norm_field(k) for k in keys]
        key_columns = self._key_columns(keys)

        fields = [self._norm_field(f) for f in fields]
        all_columns = ", ".join(fields)

        key_json = []
        fields_json = []
        for field in fields:
            if field in fetched_key:
                # Double quotes are stripped from the value shown in the row key.
                key_value = str(fetched_key[field]).replace('"', '')
                key_json.append("%s = %s" % (field, key_value))
            # Key columns show their known value in the response template;
            # every other column uses its own name as a placeholder.
            field_value = fetched_key.get(field, field)
            fields_json.append('"%s": "%s"' % (field, field_value))

        row_key = '(%s)' % ', '.join(key_json)
        response_format = ', '.join(fields_json)
        return self.ROW_TEMPLATE % (query, all_columns, key_columns, row_key, response_format)

    def parse_row_response(self, response):
        """Extract and decode the first JSON value embedded in ``response``.

        Text before the first ``{`` and anything after the end of the decoded
        object is ignored, so nested objects and ``}`` characters inside
        string values are handled correctly. If the response contains no
        ``{`` at all, the whole string is decoded as JSON.

        Raises:
            json.JSONDecodeError: if no valid JSON can be decoded.
        """
        start = response.find("{")
        if start == -1:
            return json.loads(response)

        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary from the model does not break parsing.
        response_json, _end = json.JSONDecoder().raw_decode(response, start)
        return response_json

    def create_dataframe(self, rows, columns, keys, df_ref):
        """Assemble parsed rows into a DataFrame shaped like ``df_ref``.

        Args:
            rows: List of dicts keyed by normalised column names.
            columns: Original column names; they are normalised to select and
                order the columns of the result.
            keys: Key column names as they appear in ``df_ref.columns``; rows
                duplicating an earlier key combination are dropped.
            df_ref: Reference DataFrame whose column labels are copied onto
                the result.

        Returns:
            The de-duplicated DataFrame.
        """
        df = pd.DataFrame.from_dict(rows)
        columns = [self._norm_field(col) for col in columns]
        df = df[columns]
        # Restore the reference table's original column labels.
        df.columns = df_ref.columns
        df = df.drop_duplicates(subset=keys)
        return df

## Experiment Runner

In [3]:
class ExperimentRunner():
    """Run the row-by-row table-generation experiment against a chat model.

    For every task in the metadata file, the reference table's key values
    ("oracle keys") are read from disk, one prompt per key is sent to the
    model, and the parsed rows are written to a CSV under a time-stamped
    result folder.
    """

    # NOTE: these assignments configure the ``openai`` module globally as soon
    # as the class body is executed.
    openai.api_key = ""
    openai.api_base = "https://api.deepinfra.com/v1/openai"
    MODEL = "meta-llama/Meta-Llama-3.1-405B-Instruct"
    NOTE = 'row_by_row_oracle_keys'
    # Token budget from which the prompt length is subtracted to obtain
    # ``max_tokens`` for the completion.
    MAX_LEN = 3900

    def __init__(self, table_generator, metadata_path):
        """Load task metadata and create the result folders.

        Args:
            table_generator: Object providing ``generate_row_prompt``,
                ``parse_row_response``, ``create_dataframe`` and
                ``_norm_field``.
            metadata_path: Path to the JSON file describing the tasks.

        Raises:
            OSError: if the metadata file cannot be read or the result
                folder already exists / cannot be created.
        """
        with open(metadata_path, "rb") as f:
            self.metadata = json.load(f)

        self.table_generator = table_generator
        # NOTE(review): cl100k_base is used only to estimate the prompt
        # length; presumably it is an approximation for the served model's
        # own tokenizer - confirm.
        self.encoding = tiktoken.get_encoding("cl100k_base")

        self.result_folder = "DATA/%s_%s_%s" % (self.MODEL.split("/")[-1].replace('-', '_'),
                                                   self.NOTE,
                                                   time.strftime("%Y%m%d-%H%M%S"))

        print("Experiment result folder: %s" % self.result_folder)

        os.makedirs(self.result_folder)
        os.makedirs("%s/tables" % self.result_folder)

        # idx -> {'row_prompts': [...], 'row_responses': [...], 'table_path': str}
        self.result = {}

    def _failed_row(self, columns, key_instance):
        """Return a placeholder row for a key whose retrieval or parsing failed.

        Key columns keep their known value (as a string); every other column
        is filled with ``"failed"``. The key instance is looked up through
        normalised names, so the result is the same whether or not the table
        generator has already renamed its keys.
        """
        norm = self.table_generator._norm_field
        known = {norm(k): v for k, v in key_instance.items()}
        # Build the dict directly rather than formatting and re-parsing a JSON
        # string: values containing quotes or backslashes would otherwise
        # raise inside the error handler and abort the whole table.
        return {field: str(known.get(field, "failed"))
                for field in (norm(col) for col in columns)}

    def fetch_data(self, idx):
        """Generate the table for task ``idx`` and write it to CSV.

        Prompts and raw responses are recorded in ``self.result[idx]``. A
        failure on a single row is reported by printing the exception class
        name and substituting a placeholder row; a failure of the task as a
        whole is reported the same way and yields ``None``.

        Args:
            idx: Key of the task in the metadata mapping.

        Returns:
            The generated DataFrame, or ``None`` if the task failed.

        Raises:
            KeyError: if ``idx`` is not present in the metadata.
        """
        task = self.metadata[idx]

        task_name = task['name']
        print("Fetching data for %s" % task_name)

        query = task['table_title']
        keys = task['keys']
        columns = task['columns']

        try:
            df_ref = pd.read_csv(task['path'])
            oracle_keys = df_ref[keys].to_dict('records')
            print("Fetched %d oracle key instances" % len(oracle_keys))

            self.result[idx] = {}
            self.result[idx]['row_prompts'] = []
            self.result[idx]['row_responses'] = []
            rows = []

            for key_instance in oracle_keys:
                try:
                    row_prompt_i = self.table_generator.generate_row_prompt(query, keys, columns, key_instance)
                    self.result[idx]['row_prompts'].append(row_prompt_i)

                    # Leave the rest of the token budget for the completion.
                    max_tokens = self.MAX_LEN - len(self.encoding.encode(row_prompt_i))
                    result = openai.ChatCompletion.create(
                        model=self.MODEL,
                        messages=[{"role": "user", "content": row_prompt_i}],
                        temperature=0,
                        max_tokens=max_tokens
                    )
                    row_response = result["choices"][0]["message"]["content"].strip()

                    self.result[idx]['row_responses'].append(row_response)

                    parsed_row_response = self.table_generator.parse_row_response(row_response)
                    rows.append(parsed_row_response)
                except Exception as ie:
                    # Best effort: keep the row (with its key values) so the
                    # table still has one entry per oracle key.
                    print(ie.__class__.__name__)
                    rows.append(self._failed_row(columns, key_instance))

            df = self.table_generator.create_dataframe(rows, columns, keys, df_ref)

            table_path = "%s/tables/%s.csv" % (self.result_folder, task_name)
            self.result[idx]['table_path'] = table_path
            df.to_csv(table_path, index=False)

            print("Created table with %d rows" % len(df))

            return df
        except Exception as e:
            # Best effort at task level: report and move on to the next task.
            print(e.__class__.__name__)
            return None

    def save_result(self):
        """Write the collected prompts, responses and table paths to ``result.json``."""
        with open("%s/result.json" % self.result_folder, "w") as outfile:
            result_json = json.dumps(self.result, indent=4)
            outfile.write(result_json)

## Test

In [4]:
 tg = TableGenerator_JSON()

runner = ExperimentRunner(tg, metadata_path="DATA/benchmark/cfg.json")

print("\n====================\n")

for i in range(66, 100):
    print("Table # %d" % (i+1))
    idx = "%d" % i
    table = runner.fetch_data(idx)
    print("\n====================\n")
    
runner.save_result()  

Experiment result folder: DATA/Meta_Llama_3.1_405B_Instruct_row_by_row_oracle_keys_20240924-053536


Table # 67
Fetching data for tulsa_shock_2010
Fetched 18 oracle key instances
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
JSONDecodeError
Created table with 18 rows


Table # 68
Fetching data for london_heathrow_busiest_routes_2012
Fetched 60 oracle key instances
Created table with 60 rows


Table # 69
Fetching data for hungarian_grand_prix_qualifying_2012
Fetched 24 oracle key instances
Created table with 24 rows


Table # 70
Fetching data for farum_park_national_games
Fetched 20 oracle key instances
Created table with 20 rows


Table # 71
Fetching data for india_poverty_2007
Fetched 21 oracle key instances
Created table with 21 rows


Table # 72
Fetching data for us_president_elections_idaho_