## Imports

In [1]:
import openai
import tiktoken

import pandas as pd
import json

from dateutil import parser as date_parser
from unidecode import unidecode

import os
import time
import re

## Table Generator

In [2]:
class TableGenerator_JSON():
    """Build a "list facts as JSON" prompt and turn the model's reply into a table.

    ``generate_prompt`` fills ``TEMPLATE`` with the query and the normalised
    column names; ``parse_llm_response`` converts the raw reply text into a
    ``pandas.DataFrame`` with one row per JSON object found in the reply.
    """

    # Prompt skeleton. Placeholders, in order: query text (%s), number of
    # fields (%d), the list of normalised field names (%s) and the
    # comma-separated '"key": "value"' skeleton of one response element (%s).
    TEMPLATE = """
    [INST] <<SYS>>
    You are a retriever of facts.
    <</SYS>>  
    
    List %s - as many as possible to fit into response.
    The response will be formatted as JSON shown below.
    Each element of the response will contain %d fields: %s.
    Do not output any additional text that is not in JSON format.
    
    RESPONSE FORMAT:
    [{
        %s
    }]

    [/INST]
    """

    # Single-character rewrite table used by _norm_field: separators become
    # "_", other punctuation is dropped (None means "delete the character").
    _FIELD_CHAR_MAP = str.maketrans({
        " ": "_", "-": "_", ",": "_",
        ".": None, "(": None, ")": None, ":": None,
        '"': None, "'": None, "/": None,
    })

    def _norm_field(self, s):
        """Return ``s`` as a lower-case, underscore-separated field name.

        Spaces, hyphens and commas become underscores; ``. ( ) : " ' /`` are
        removed; runs of underscores are collapsed into a single one.
        """
        s = s.lower().translate(self._FIELD_CHAR_MAP)
        return re.sub('_+', '_', s)

    def generate_prompt(self, query, fields):
        """Return the prompt asking the model to list ``query`` as JSON rows.

        Args:
            query: Text describing what should be listed.
            fields: Iterable of column names; each one is normalised with
                ``_norm_field`` before being placed in the prompt.

        Returns:
            ``TEMPLATE`` filled in with the query, the number of fields, the
            list of normalised field names and a one-element JSON skeleton.
        """
        num_fields = len(fields)
        fields = [self._norm_field(f) for f in fields]
        # _norm_field has already removed every "-", so only leftover
        # whitespace (e.g. tabs) still needs joining with "_" for the key.
        fields_json = ['"%s": "%s"' % ('_'.join(field.split()), field)
                       for field in fields]
        response_format = ', '.join(fields_json)
        return self.TEMPLATE % (query, num_fields, fields, response_format)

    def _parse_loose_objects(self, text):
        """Best-effort extraction of flat ``{...}`` objects from invalid JSON.

        Every ``{ ... }`` span is split on commas into attributes and each
        attribute is split on its FIRST colon, so values that themselves
        contain a colon (times, URLs, ratios) are kept whole. Double quotes
        are stripped from keys and values. Objects with no closing ``}``
        (for example a reply cut off mid-row) are skipped. Values that
        contain commas or braces are still split incorrectly.
        """
        records = []
        for chunk in text.split("{")[1:]:
            pieces = chunk.split("}")
            if len(pieces) < 2:
                # No closing brace: incomplete object, ignore it.
                continue
            record = {}
            for attr in pieces[0].split(","):
                key, sep, value = attr.partition(":")
                if sep:
                    record[key.replace('"', '').strip()] = value.replace('"', '').strip()
            records.append(record)
        return records

    def parse_llm_response(self, response):
        """Convert the model's reply text into a ``pandas.DataFrame``.

        Text before the first ``[`` and after the last ``]`` is discarded; a
        reply that contains braces but no brackets at all is wrapped in
        ``[...]``. The result is parsed as JSON, and a dict with exactly one
        key is replaced by that key's value. If JSON parsing fails, the
        trimmed text is handed to ``_parse_loose_objects`` instead.

        Args:
            response: Raw reply text from the model.

        Returns:
            DataFrame built with ``pandas.DataFrame.from_records`` from the
            parsed records.
        """
        try:
            if not response.startswith("[") and "[" in response:
                response = response[response.find("["):]

            if not response.endswith("]") and "]" in response:
                response = response[:response.rfind("]") + 1]

            if '[' not in response and ']' not in response and '{' in response and '}' in response:
                response = '[' + response + ']'

            response_json = json.loads(response)

            if isinstance(response_json, dict) and len(response_json) == 1:
                response_json = list(response_json.values())[0]
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError. Catching only parse
            # failures keeps KeyboardInterrupt/SystemExit from being swallowed.
            response_json = self._parse_loose_objects(response)

        return pd.DataFrame.from_records(response_json)

## Experiment Runner

In [3]:
class ExperimentRunner():
    """Run table-generation tasks against a chat model and store the results.

    For each task in the metadata file the runner builds a prompt with the
    supplied table generator, sends it to the chat-completion endpoint,
    parses the reply into a DataFrame, writes it as CSV under the result
    folder, and keeps the prompt / raw response / table path in ``self.result``.
    """

    # These two assignments run once, when the class body is executed, and
    # configure the `openai` module globally. The key is blank here and must
    # be supplied before any request can succeed.
    openai.api_key = ""
    openai.api_base = "https://api.deepinfra.com/v1/openai"
    MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
    # Free-form tag embedded in the result folder name.
    NOTE = 'full_table'
    # Token budget shared by prompt and completion (see fetch_data).
    MAX_LEN = 3900

    def __init__(self, table_generator, metadata_path):
        """Load task metadata and create a fresh, timestamped result folder.

        Args:
            table_generator: Object providing ``generate_prompt(query, columns)``
                and ``parse_llm_response(text)``.
            metadata_path: Path to a JSON file of tasks. Each task is read for
                the keys ``name``, ``table_title``, ``columns``, ``path`` and
                ``keys``.

        Raises:
            FileExistsError: If the result folder already exists.
        """
        with open(metadata_path, "rb") as f:
            self.metadata = json.load(f)

        self.table_generator = table_generator
        # Used only to count prompt tokens. cl100k_base is an OpenAI encoding,
        # so for the Llama model the count is presumably an approximation.
        self.encoding = tiktoken.get_encoding("cl100k_base")

        model_tag = self.MODEL.split("/")[-1].replace('-', '_')
        self.result_folder = "DATA/%s_%s_%s" % (model_tag,
                                                self.NOTE,
                                                time.strftime("%Y%m%d-%H%M%S"))

        print("Experiment result folder: %s" % self.result_folder)

        os.makedirs(self.result_folder)
        os.makedirs("%s/Tables" % self.result_folder)

        # idx -> {'prompt', 'response', 'table_path'} (plus 'error' on failure)
        self.result = {}

    def fetch_data(self, idx):
        """Generate, parse and save the table for the task stored under ``idx``.

        Args:
            idx: Key into the loaded metadata.

        Returns:
            The de-duplicated DataFrame that was written to CSV, or ``None``
            when the request, parsing or saving failed. On failure the
            exception is printed and also stored under
            ``self.result[idx]['error']`` so it ends up in ``result.json``.

        Raises:
            KeyError: If ``idx`` (or a required task field) is missing from
                the metadata; this lookup happens before the guarded section.
        """
        task = self.metadata[idx]

        task_name = task['name']
        print("Fetching data for %s" % task_name)

        query, columns = task['table_title'], task['columns']
        print("Query: ", query)

        prompt = self.table_generator.generate_prompt(query, columns)

        # Starts a fresh record for this task; any earlier record is replaced.
        self.result[idx] = {'prompt': prompt}

        try:
            # Whatever part of the budget the prompt does not use is left
            # for the completion.
            max_tokens = self.MAX_LEN - len(self.encoding.encode(prompt))
            result = openai.ChatCompletion.create(
                model=self.MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=max_tokens
            )

            response = result["choices"][0]["message"]["content"].strip()

            # The record was just re-created above, so there is never an
            # existing 'response' list to append to.
            self.result[idx]['response'] = [response]

            df = self.table_generator.parse_llm_response(response)
            df_ref = pd.read_csv(task['path'])
            # Positional rename to the reference header; pandas raises if the
            # column counts differ, which is reported by the handler below.
            df.columns = df_ref.columns
            df = df.drop_duplicates(subset=task['keys'])

            table_path = "%s/Tables/%s.csv" % (self.result_folder, task_name)
            self.result[idx]['table_path'] = table_path
            df.to_csv(table_path, index=False)

            print("Created table with %d rows" % len(df))

            return df
        except Exception as e:
            # Best effort: one failing task must not stop the whole run.
            print(e.__class__.__name__)
            print(e)
            self.result[idx]['error'] = "%s: %s" % (e.__class__.__name__, e)
            return None

    def save_result(self):
        """Write ``self.result`` to ``result.json`` inside the result folder."""
        with open("%s/result.json" % self.result_folder, "w") as outfile:
            json.dump(self.result, outfile, indent=4)

## Test

In [4]:
# Drive the full benchmark: one request per task, then persist the run log.
SEPARATOR = "\n====================\n"

tg = TableGenerator_JSON()
runner = ExperimentRunner(tg, metadata_path="DATA/Benchmark/cfg.json")

print(SEPARATOR)

# Tasks are looked up by the string form of their 0-based position;
# the printed counter is 1-based.
for i in range(100):
    idx = str(i)
    print("Table # %d" % (i + 1))
    table = runner.fetch_data(idx)
    print(SEPARATOR)

runner.save_result()

Experiment result folder: DATA/Meta_Llama_3.1_70B_Instruct_full_table_20240726-231535


Table # 1
Fetching data for republican_straw_polls_2012
Query:  results of straw polls for the Republican Party presidential primaries, 2012
Created table with 32 rows


Table # 2
Fetching data for russia_demographics_1946_2012
Query:  vital statistics of Russia's demographics from 1946 to 2012
Created table with 15 rows


Table # 3
Fetching data for belgium_demographics_1900_2011
Query:  vital statistics of Belgium's demographics from 1900 to 2011
Created table with 13 rows


Table # 4
Fetching data for australia_demographics_1900_2010
Query:  vital statistics of Australia's demographics from 1900 to 2010
Created table with 23 rows


Table # 5
Fetching data for new_brunswick_parishes_2006_2011
Query:  population statistics of the parishes in New Brunswick
Created table with 56 rows


Table # 6
Fetching data for ice_hockey_2006
Query:  statistics for men's ice hockey at the 2006 Winter Olympics
Crea

Created table with 19 rows


Table # 48
Fetching data for european_countries_gdp_2007_2012
Query:  sovereign states in Europe by GDP (nominal)
Created table with 48 rows


Table # 49
Fetching data for royal_dulton_figurines_HN4100_HN4199
Query:  Royal Dulton figurines from HN4100 to HN4199
Created table with 83 rows


Table # 50
Fetching data for adaalat_episodes_2012
Query:  Adaalat episodes from 1 January 2012 to 31 December 2012
Created table with 10 rows


Table # 51
Fetching data for viktoria_plzen_1993_2012
Query:  stats of FC Viktoria Plzen from 1993-1994 to 2012-2013
Created table with 20 rows


Table # 52
Fetching data for just_dance_kids_2_tracks
Query:  tracks of Just Dance Kids 2
Created table with 34 rows


Table # 53
Fetching data for cross_country_junior_women_1996
Query:  results of 1996 IAAF World Cross Country Championships in Junior women's race
Created table with 20 rows


Table # 54
Fetching data for metropolitan_opera_us_premieres
Query:  United States premieres a

Created table with 13 rows


Table # 96
Fetching data for bifa_british_independent_film_2010_2012
Query:  winners and nominees of BIFA Award for Best British Independent Film from 2010 to 2012
Created table with 15 rows


Table # 97
Fetching data for through_the_wormhole_season_4
Query:  episodes of Through the Wormhole season 4
Created table with 10 rows


Table # 98
Fetching data for un_habitat_scroll_of_honour_award_1991
Query:  winners of UN-Habitat Scroll of Honour Award in 1991
Created table with 7 rows


Table # 99
Fetching data for miss_universe_semifinal_scores_1993
Query:  semifinal scores of Miss Universe 1993
Created table with 6 rows


Table # 100
Fetching data for woodley_season_1_2012
Query:  episodes of Woodley season 1
Created table with 8 rows


