# prepare starcoder 

In [None]:
!pip install jsonformer

In [3]:
# fix jsonformer
def apply_generate_array_fix(jsonformer):
    """Patch ``jsonformer`` in place with replacement array and prompt logic.

    Two attributes of the given object are overwritten with the closures
    defined below: ``get_prompt`` and ``generate_array``. The same object is
    returned so the call can be chained.
    """

    def get_prompt():
        """Return the user prompt followed by the JSON produced so far.

        The serialized value is cut immediately before the quoted generation
        marker, so the text ends where the next value has to be generated.

        Raises:
            ValueError: if the marker does not occur in the serialized value.
        """
        serialized = json.dumps(jsonformer.value)
        cut = serialized.find(f'"{jsonformer.generation_marker}"')
        if cut == -1:
            raise ValueError("Failed to find generation marker")
        # The JSON schema is deliberately not part of the prompt.
        return f"{jsonformer.prompt}{serialized[:cut]}"

    def generate_array(item_schema, obj) -> list:
        """Fill ``obj`` with generated elements and return it.

        After every element the model is asked for its next-token logits;
        generation continues only while a token containing ``{`` outranks
        every token containing ``]`` among the 30 most likely tokens.
        """
        for _ in range(jsonformer.max_array_length):
            # Generating before probing forces at least one element.
            # NOTE(review): writing to obj[-1] presumes generate_value left a
            # placeholder at the end of obj - confirm against Jsonformer.
            obj[-1] = jsonformer.generate_value(item_schema, obj)

            # Temporarily append the marker so the prompt ends right after
            # the element that was just produced.
            obj.append(jsonformer.generation_marker)
            input_prompt = jsonformer.get_prompt()
            obj.pop()

            encoded = jsonformer.tokenizer.encode(input_prompt, return_tensors="pt")
            model_output = jsonformer.model.forward(encoded.to(jsonformer.model.device))
            next_token_logits = model_output.logits[0, -1]

            candidates = next_token_logits.topk(30).indices
            ranked_token_ids = candidates[
                next_token_logits[candidates].argsort(descending=True)
            ]

            wants_another_element = False
            for token_id in ranked_token_ids:
                piece = jsonformer.tokenizer.decode(token_id)
                if '{' in piece:
                    wants_another_element = True
                    break
                if ']' in piece:
                    break

            if not wants_another_element:
                break

        return obj

    jsonformer.get_prompt = get_prompt
    jsonformer.generate_array = generate_array
    return jsonformer

In [None]:
import argparse
import torch
from dialogues import DialogueTemplate, get_dialogue_template
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          GenerationConfig, set_seed)
from jsonformer import Jsonformer
from peft import PeftModel, PeftConfig


def initialize_model(model_id, tokenizer_model_id=None):
    """Load a causal language model, its tokenizer and its dialogue template.

    Args:
        model_id: Identifier or path handed to ``from_pretrained`` for both
            the dialogue template and the model.
        tokenizer_model_id: Identifier or path for the tokenizer; falls back
            to ``model_id`` when empty.

    Returns:
        A ``(model, tokenizer, dialogue_template)`` tuple.
    """
    if not tokenizer_model_id:
        tokenizer_model_id = model_id
    revision = None
    set_seed(42)

    try:
        dialogue_template = DialogueTemplate.from_pretrained(model_id, revision=revision)
    except Exception:
        print("No dialogue template found in model repo. Defaulting to the `no_system` template.")
        dialogue_template = get_dialogue_template("no_system")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_id, revision=revision)
    # Generation should stop at the dialogue template's end token.
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(dialogue_template.end_token)

    # Removed dead code: a self-assignment of eos_token_id and the unused
    # locals system_prompt, generation_config and device. None of them
    # affected the returned objects.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, revision=revision, load_in_8bit=True, device_map="auto", torch_dtype=torch.float16
    )
    # PEFT adapter loading is disabled:
    #config = PeftConfig.from_pretrained(model_id)
    #model = PeftModel.from_pretrained(model, config)

    return model, tokenizer, dialogue_template

In [5]:
import os
import json

def generate_response_as_json_starcoder(prompt_text):
    """Generate a schema-constrained foreign-key JSON for ``prompt_text``.

    Relies on the module-level ``model``, ``tokenizer`` and
    ``dialogue_template``. The prompt is wrapped as a single user message,
    rendered through the dialogue template and handed to a patched
    ``Jsonformer`` whose schema only admits the table and column names
    parsed from the prompt.

    Returns:
        Whatever calling the ``Jsonformer`` instance returns.
    """
    dialogue_template.messages = [{"role": "user", "content": prompt_text}]
    formatted_prompt = dialogue_template.get_inference_prompt()

    src_table, src_cols, trg_table, trg_cols = get_enums_from_prompt_text(prompt_text)

    # Allowed values per output field; insertion order defines the property order.
    allowed_values = {
        "table": [src_table],
        "column": src_cols,
        "referencedTable": [trg_table],
        "referencedColumn": trg_cols,
    }
    json_schema_fk = {
        "type": "object",
        "properties": {
            field: {"type": "enum", "values": values}
            for field, values in allowed_values.items()
        },
        "required": ["table", "column", "referencedTable", "referencedColumn"],
    }

    constrained_generator = Jsonformer(model, tokenizer, json_schema_fk, formatted_prompt, debug=False)
    apply_generate_array_fix(constrained_generator)
    return constrained_generator()

# prepare openai

In [4]:
import openai

from tenacity import retry, wait_random_exponential, stop_after_attempt
@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3))
def generate_response_as_json_gpt35(prompt_text):
    """Ask gpt-3.5-turbo for a foreign key via function calling.

    The table and column names parsed from ``prompt_text`` are offered to the
    model as the permitted values of the ``get_foreign_key`` function
    arguments. Any exception (API failure, missing ``function_call``,
    invalid JSON) makes the ``@retry`` decorator try again, up to three
    attempts with randomized exponential back-off.

    Args:
        prompt_text: Prompt in the layout expected by
            ``get_enums_from_prompt_text``.

    Returns:
        The function-call arguments decoded from JSON.
    """
    src_table, src_cols, trg_table, trg_cols = get_enums_from_prompt_text(prompt_text)

    # JSON Schema restricts a value to a fixed set with "enum". The previous
    # "values" key is not a schema keyword, so the restriction was never applied.
    functions = [
        {
            "name": "get_foreign_key",
            "description": "Get foreign keys",
            "parameters": {
                "type": "object",
                "properties": {
                    "table": {
                        "type": "string",
                        "enum": [src_table],
                    },
                    "column": {
                        "type": "string",
                        "enum": src_cols,
                    },
                    "referencedTable": {
                        "type": "string",
                        "enum": [trg_table],
                    },
                    "referencedColumn": {
                        "type": "string",
                        "enum": trg_cols,
                    },
                },
                "required": ["table", "column", "referencedTable", "referencedColumn"],
            },
        }
    ]
    foreign_keys_llm = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that always returns responses in JSON without any additional explanations. Only Respond with the desired JSON, NOTHING else, no explanations."},
            {"role": "user", "content": prompt_text},
        ],
        functions=functions,
        temperature=0,
    )["choices"][0]["message"]

    return json.loads(foreign_keys_llm["function_call"]["arguments"])

# prepare t5

In [18]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,  Seq2SeqTrainingArguments, Seq2SeqTrainer

# Inputs longer than this many tokens are truncated by the tokenizer call
# in generate_responses_as_json_t5.
max_input_length = 1024

# The T5 checkpoint and its tokenizer are read from the same local directory.
model_name = "t5-base-schemapile"
model_dir = "../../data/" + model_name

model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

def generate_responses_as_json_t5(prompt):
    """Run the module-level seq2seq ``model`` on ``prompt`` and parse the result.

    The decoded output is wrapped in curly braces before being parsed as
    JSON. If parsing fails, the wrapped text is printed and the exception is
    re-raised.
    """
    encoded = tokenizer([prompt], max_length=max_input_length, truncation=True, return_tensors="pt")
    generated = model.generate(**encoded, num_beams=8, do_sample=False, min_length=10, max_length=512)
    decoded_output = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    # The decoded text lacks the surrounding braces of a JSON object.
    wrapped = '{' + decoded_output + "}"
    try:
        return json.loads(wrapped)
    except Exception as e:
        print(wrapped)
        raise e

# load ground truth with additional fk-filter

In [None]:
import json
from datasets import load_dataset
# Collect the lower-cased content of the second message of every training
# example and persist the distinct values as a JSON list.
dataset = load_dataset('../../data/data/foreign_keys_instruction_data_schemapile')
ds = dataset['train']['messages']
fks_schemapile = {d[1]['content'].lower() for d in ds}
# The context manager guarantees the file is flushed and closed before the
# next cell reads it back.
with open('fks_schemapile.json', 'w') as f:
    json.dump(list(fks_schemapile), f)

In [25]:
# Reload the persisted strings as a set for fast membership tests; the
# context manager closes the file handle.
with open('fks_schemapile.json') as f:
    fks_schemapile = set(json.load(f))

In [26]:
def clean_dataset_from_fk_pairs(prompts_ground_truth):
    """Drop entries whose foreign key already occurs in ``fks_schemapile``.

    The ``'foreign_key'`` string of every entry is lower-cased and looked up
    in the module-level ``fks_schemapile`` collection. The key of each
    dropped entry is printed, followed by a summary line.

    Returns:
        A new dict holding only the entries that were kept.
    """
    kept = {}
    for name, entry in prompts_ground_truth.items():
        if entry['foreign_key'].lower() in fks_schemapile:
            print(name)
            continue
        kept[name] = entry

    total = len(prompts_ground_truth)
    print(f"Excluded: {total - len(kept)}/{total}")
    return kept

In [None]:
import json
import os
# Load each benchmark's prompts/ground truth and filter them through
# clean_dataset_from_fk_pairs. Files are opened with a context manager so
# no handle is left open.
with open("prompts_ground_truth_spider.json") as f:
    prompts_ground_truth_spider = clean_dataset_from_fk_pairs(json.load(f))
with open("prompts_ground_truth_bird.json") as f:
    prompts_ground_truth_bird = clean_dataset_from_fk_pairs(json.load(f))
with open("prompts_ground_truth_ctu.json") as f:
    prompts_ground_truth_ctu = clean_dataset_from_fk_pairs(json.load(f))

In [None]:
# Group the CTU entries whose key starts (case-insensitively) with one of
# the listed prefixes, one dict per prefix.
ctu_subsets = ['tpcc', 'tpce', 'tpcd']
prompts_ground_truth_ctu_tpc_subset = {
    s: {
        p: entry
        for p, entry in prompts_ground_truth_ctu.items()
        if p.lower().startswith(s)
    }
    for s in ctu_subsets
}

# generate predictions

In [11]:
def get_enums_from_prompt_text(prompt_text):
    """Extract table and column names from the 2nd and 3rd prompt lines.

    Each of those lines is read as ``name(col1, col2, ...)``: the text before
    the first ``(`` is the table name, and the text between the first ``(``
    and the first ``)`` is split on ``", "`` into column names.

    Returns:
        ``(src_table, src_cols, trg_table, trg_cols)``, where the ``*_cols``
        items are lists of strings.
    """
    lines = prompt_text.split("\n")

    def split_signature(signature):
        opening = signature.find("(")
        closing = signature.find(")")
        return signature[:opening], signature[opening + 1:closing].split(", ")

    src_table, src_cols = split_signature(lines[1])
    trg_table, trg_cols = split_signature(lines[2])
    return src_table, src_cols, trg_table, trg_cols

def generate_responses(prompts_ground_truth, method, output_file, recompute=False):
    """Run ``method`` on every prompt and checkpoint the results as JSON.

    Args:
        prompts_ground_truth: Dict mapping a dataset name to an entry with a
            ``"prompt"`` key.
        method: Callable taking the prompt and returning a JSON-serializable
            response.
        output_file: Path of the JSON cache. Unless ``recompute`` is set, an
            existing file is loaded first and the names already in it are
            skipped.
        recompute: Ignore an existing ``output_file`` and start from scratch.

    Returns:
        Dict mapping dataset name to response. A prompt whose ``method`` call
        raised is stored as ``None`` and its error message is printed.
    """
    responses = {}
    if os.path.exists(output_file) and not recompute:
        # Context manager so the cache file handle is closed after reading.
        with open(output_file) as f:
            responses = json.load(f)

    for dataset_name, entry in prompts_ground_truth.items():
        if dataset_name in responses:
            continue

        print(f"determining fk's for: {dataset_name}")

        prompt = entry["prompt"]
        response_json = None
        try:
            response_json = method(prompt)
        except Exception as e:
            print(f"error with prompt {dataset_name}: {str(e)}")
        responses[dataset_name] = response_json

        # Rewrite the cache after every prompt so an interrupted run can resume.
        with open(output_file, "w") as f:
            json.dump(responses, f)
    return responses

## generate predictions t5

In [None]:
# T5 predictions for the Spider prompts (resumes from the JSON file if present).
responses_t5_schemapile_spider = generate_responses(
    prompts_ground_truth=prompts_ground_truth_spider,
    method=generate_responses_as_json_t5,
    output_file="responses_t5_schemapile_spider.json",
)

In [None]:
# T5 predictions for the BIRD prompts (resumes from the JSON file if present).
responses_t5_schemapile_bird = generate_responses(
    prompts_ground_truth=prompts_ground_truth_bird,
    method=generate_responses_as_json_t5,
    output_file="responses_t5_schemapile_bird.json",
)

In [None]:
# T5 predictions for the CTU prompts (resumes from the JSON file if present).
responses_t5_schemapile_ctu = generate_responses(
    prompts_ground_truth=prompts_ground_truth_ctu,
    method=generate_responses_as_json_t5,
    output_file="responses_t5_schemapile_ctu.json",
)

## generate predictions starcoder-ctu

In [None]:
# NOTE: rebinds the module-level `model` and `tokenizer` that the T5 cell also assigns.
model, tokenizer, dialogue_template = initialize_model(model_id="../../data/starcoder-ctu")

In [15]:
# Spider predictions with whichever starcoder model is currently loaded.
responses_starcoder_ctu_spider = generate_responses(
    prompts_ground_truth=prompts_ground_truth_spider,
    method=generate_response_as_json_starcoder,
    output_file="responses_starcoder_ctu_spider.json",
)

## generate predictions starcoder-ctu-peft

In [None]:
# NOTE: rebinds the module-level `model` and `tokenizer` that the T5 cell also assigns.
model, tokenizer, dialogue_template = initialize_model(model_id="../../data/starcoder-ctu_peft")

In [14]:
# The PEFT run needs its own cache file. It previously shared
# "responses_starcoder_ctu_spider.json" with the non-PEFT run, so whichever
# ran second loaded the other's cached results instead of computing its own.
responses_starcoder_ctu_peft_spider = generate_responses(
    prompts_ground_truth_spider,
    generate_response_as_json_starcoder,
    "responses_starcoder_ctu_peft_spider.json",
)

## generate predictions starcoder-schemapile

In [None]:
# NOTE: rebinds the module-level `model` and `tokenizer` that the T5 cell also assigns.
model, tokenizer, dialogue_template = initialize_model(model_id="../../data/starcoder-schemapile")

In [22]:
# Spider predictions with whichever starcoder model is currently loaded.
responses_starcoder_schemapile_spider = generate_responses(
    prompts_ground_truth=prompts_ground_truth_spider,
    method=generate_response_as_json_starcoder,
    output_file="responses_starcoder_schemapile_spider.json",
)

In [23]:
# BIRD predictions with whichever starcoder model is currently loaded.
responses_starcoder_schemapile_bird = generate_responses(
    prompts_ground_truth=prompts_ground_truth_bird,
    method=generate_response_as_json_starcoder,
    output_file="responses_starcoder_schemapile_bird.json",
)

In [24]:
# CTU predictions with whichever starcoder model is currently loaded.
responses_starcoder_schemapile_ctu = generate_responses(
    prompts_ground_truth=prompts_ground_truth_ctu,
    method=generate_response_as_json_starcoder,
    output_file="responses_starcoder_schemapile_ctu.json",
)

## generate predictions starcoder alpha

In [None]:
# NOTE: rebinds the module-level `model` and `tokenizer` that the T5 cell also assigns.
model, tokenizer, dialogue_template = initialize_model(model_id="HuggingFaceH4/starchat-alpha")

In [25]:
# Spider predictions with whichever starcoder model is currently loaded.
responses_starcoder_alpha_spider = generate_responses(
    prompts_ground_truth=prompts_ground_truth_spider,
    method=generate_response_as_json_starcoder,
    output_file="responses_starcoder_alpha_spider.json",
)

In [26]:
# BIRD predictions with whichever starcoder model is currently loaded.
responses_starcoder_alpha_bird = generate_responses(
    prompts_ground_truth=prompts_ground_truth_bird,
    method=generate_response_as_json_starcoder,
    output_file="responses_starcoder_alpha_bird.json",
)

In [None]:
# CTU predictions with whichever starcoder model is currently loaded.
responses_starcoder_alpha_ctu = generate_responses(
    prompts_ground_truth=prompts_ground_truth_ctu,
    method=generate_response_as_json_starcoder,
    output_file="responses_starcoder_alpha_ctu.json",
)

## generate predictions gpt3.5

In [None]:
# gpt-3.5 predictions for the Spider prompts (resumes from the JSON file if present).
responses_gpt35_spider = generate_responses(
    prompts_ground_truth=prompts_ground_truth_spider,
    method=generate_response_as_json_gpt35,
    output_file="responses_gpt35_spider.json",
)

In [None]:
# gpt-3.5 predictions for the BIRD prompts (resumes from the JSON file if present).
responses_gpt35_bird = generate_responses(
    prompts_ground_truth=prompts_ground_truth_bird,
    method=generate_response_as_json_gpt35,
    output_file="responses_gpt35_bird.json",
)

In [None]:
# gpt-3.5 predictions for the CTU prompts (resumes from the JSON file if present).
responses_gpt35_ctu = generate_responses(
    prompts_ground_truth=prompts_ground_truth_ctu,
    method=generate_response_as_json_gpt35,
    output_file="responses_gpt35_ctu.json",
)

# evaluation

In [29]:
def evaluate_responses(responses, prompts_ground_truth):
    """Compare predicted foreign keys with the ground truth and print a summary.

    Args:
        responses: Dict mapping dataset name to a predicted foreign-key dict,
            or ``None`` when generation failed.
        prompts_ground_truth: Dict mapping dataset name to an entry whose
            ``"foreign_key"`` value is a JSON-encoded dict.

    Returns:
        ``(match, no_match, error, success_rate)``. Responses without a
        ground-truth entry are reported and not counted. The success rate is
        ``0.0`` when nothing was counted.
    """
    error = 0
    match = 0
    no_match = 0

    for dataset_name, foreign_key_prediction in responses.items():
        if dataset_name not in prompts_ground_truth:
            print(f"dataset {dataset_name} not found in ground truth, skipping")
            continue

        foreign_key_ground_truth = json.loads(prompts_ground_truth[dataset_name]["foreign_key"])

        if foreign_key_prediction is None:
            error += 1
            continue

        # Keys and values must agree in the same order, so a prediction with
        # the right pairs in a different key order counts as no match.
        if list(foreign_key_prediction.items()) == list(foreign_key_ground_truth.items()):
            match += 1
        else:
            no_match += 1

    total = match + no_match + error
    # Guard against division by zero when no response could be evaluated.
    success_rate = match / total if total else 0.0

    print("match: "+str(match))
    print("no match: "+str(no_match))
    print("error: "+str(error))
    print("success rate: "+str(success_rate))
    return match, no_match, error, success_rate

In [None]:
# T5 on Spider.
evaluate_responses(
    responses=responses_t5_schemapile_spider,
    prompts_ground_truth=prompts_ground_truth_spider,
)

In [None]:
# T5 on BIRD.
evaluate_responses(
    responses=responses_t5_schemapile_bird,
    prompts_ground_truth=prompts_ground_truth_bird,
)

In [None]:
# T5 on CTU.
evaluate_responses(
    responses=responses_t5_schemapile_ctu,
    prompts_ground_truth=prompts_ground_truth_ctu,
)

In [None]:
# starcoder-ctu-peft on Spider.
evaluate_responses(
    responses=responses_starcoder_ctu_peft_spider,
    prompts_ground_truth=prompts_ground_truth_spider,
)

In [None]:
# starcoder-ctu on Spider.
evaluate_responses(
    responses=responses_starcoder_ctu_spider,
    prompts_ground_truth=prompts_ground_truth_spider,
)

In [None]:
# gpt-3.5 on Spider.
evaluate_responses(
    responses=responses_gpt35_spider,
    prompts_ground_truth=prompts_ground_truth_spider,
)

In [None]:
# gpt-3.5 on BIRD.
evaluate_responses(
    responses=responses_gpt35_bird,
    prompts_ground_truth=prompts_ground_truth_bird,
)

In [None]:
# gpt-3.5 on CTU.
evaluate_responses(
    responses=responses_gpt35_ctu,
    prompts_ground_truth=prompts_ground_truth_ctu,
)

In [None]:
# starcoder alpha on Spider.
evaluate_responses(
    responses=responses_starcoder_alpha_spider,
    prompts_ground_truth=prompts_ground_truth_spider,
)

In [None]:
# starcoder alpha on BIRD.
evaluate_responses(
    responses=responses_starcoder_alpha_bird,
    prompts_ground_truth=prompts_ground_truth_bird,
)

In [None]:
# starcoder alpha on CTU.
evaluate_responses(
    responses=responses_starcoder_alpha_ctu,
    prompts_ground_truth=prompts_ground_truth_ctu,
)

In [None]:
# starcoder-schemapile on Spider.
evaluate_responses(
    responses=responses_starcoder_schemapile_spider,
    prompts_ground_truth=prompts_ground_truth_spider,
)

In [None]:
# starcoder-schemapile on BIRD.
evaluate_responses(
    responses=responses_starcoder_schemapile_bird,
    prompts_ground_truth=prompts_ground_truth_bird,
)

In [None]:
# starcoder-schemapile on CTU.
evaluate_responses(
    responses=responses_starcoder_schemapile_ctu,
    prompts_ground_truth=prompts_ground_truth_ctu,
)

In [None]:
# Evaluate the starcoder-schemapile CTU predictions separately for every
# TPC subset; the unpacking expects evaluate_responses to return four values.
responses = {}
for s in ctu_subsets:
    subset_ground_truth = prompts_ground_truth_ctu_tpc_subset[s]
    match, no_match, error, rate = evaluate_responses(
        responses_starcoder_schemapile_ctu, subset_ground_truth
    )
    responses[s] = (match, no_match, error, rate)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# Grouped bar chart: one cluster per benchmark, one bar per model.
# group_data has one row per entry of `groups` and one column per entry of `models`.
models = ['jaccard', 'gpt-3.5', 'starcoder-alpha', 't5-schemapile', 'starcoder-schemapile']
groups = ['spider', 'bird', 'ctu']
group_data = np.array([[0.58, 0.88, 0.64, 0.92, 0.97], [0.63, 0.82,0.61, 0.83, 0.94], [0.36, 0.86, 0.82, 0.91, 0.97]])

# One colour per model (previously hard-coded to the first five palette entries)
colors = sns.color_palette('deep')[:len(models)]

barWidth = 0.15

# Bar i of every cluster is shifted right by i bar widths
for i, label in enumerate(models):
    r = [j + barWidth*i for j in range(len(groups))]
    plt.bar(r, group_data[:, i], width=barWidth, color=colors[i], edgecolor='grey', label=label)

# Put each tick under the centre of its cluster; the former offset of one
# bar width placed it under the second of the five bars.
plt.xlabel('Groups', fontweight='bold')
cluster_center_offset = barWidth * (len(models) - 1) / 2
plt.xticks([j + cluster_center_offset for j in range(len(groups))], groups)

# Add ylabel
plt.ylabel('recall@k=1')

# Legend outside the axes, to the right
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Show the plot
plt.show()