In [None]:
# prepare starcoder 

In [None]:
!pip install jsonformer

In [1]:
# patch jsonformer
def apply_generate_array_fix(jsonformer):
    """Monkey-patch ``generate_array`` and ``get_prompt`` on a Jsonformer-like object.

    Both replacements are closures over ``jsonformer`` and are assigned as
    instance attributes, so they are called without ``self``.

    Args:
        jsonformer: object exposing ``max_array_length``, ``generate_value``,
            ``generation_marker``, ``tokenizer``, ``model``, ``value`` and
            ``prompt``.

    Returns:
        The same ``jsonformer`` object, patched in place.
    """
    # Imported locally so this cell does not depend on a later notebook cell
    # having imported ``json`` before ``get_prompt`` is first called.
    import json

    def generate_array(item_schema, obj) -> list:
        """Fill ``obj`` with generated elements and return it.

        At most ``jsonformer.max_array_length`` elements are generated. After
        each element the model's next-token logits are inspected to decide
        whether another element should follow.
        """
        for _ in range(jsonformer.max_array_length):
            # forces array to have at least one element
            element = jsonformer.generate_value(item_schema, obj)
            # Overwrites the last slot of ``obj`` with the generated element
            # (presumably a placeholder appended by generate_value -- confirm
            # against the installed jsonformer version).
            obj[-1] = element

            # Temporarily append the marker so get_prompt() cuts the prompt
            # right after the element that was just generated.
            obj.append(jsonformer.generation_marker)
            input_prompt = jsonformer.get_prompt()
            obj.pop()
            input_tensor = jsonformer.tokenizer.encode(input_prompt, return_tensors="pt")
            output = jsonformer.model.forward(input_tensor.to(jsonformer.model.device))
            logits = output.logits[0, -1]

            # The 30 highest-scoring next tokens, ordered from most to least likely.
            top_indices = logits.topk(30).indices
            sorted_token_ids = top_indices[logits[top_indices].argsort(descending=True)]

            # Continue only if a token containing '{' ranks ahead of any token
            # containing ']'; a ']' first (or neither) ends the array.
            found_open_brace = False
            found_close_bracket = False
            for token_id in sorted_token_ids:
                decoded_token = jsonformer.tokenizer.decode(token_id)
                if '{' in decoded_token:
                    found_open_brace = True
                    break
                if ']' in decoded_token:
                    found_close_bracket = True
                    break

            if found_close_bracket or not found_open_brace:
                break

        return obj

    def get_prompt():
        """Return the user prompt followed by the JSON generated so far.

        The partial JSON is ``jsonformer.value`` serialised up to (excluding)
        the quoted generation marker. The schema is not included in the prompt.

        Raises:
            ValueError: if the generation marker is not present in the
                serialised value.
        """
        template = """{prompt}{progress}"""
        progress = json.dumps(jsonformer.value)
        gen_marker_index = progress.find(f'"{jsonformer.generation_marker}"')
        if gen_marker_index == -1:
            raise ValueError("Failed to find generation marker")
        progress = progress[:gen_marker_index]

        return template.format(
            prompt=jsonformer.prompt,
            progress=progress,
        )

    jsonformer.get_prompt = get_prompt
    jsonformer.generate_array = generate_array
    return jsonformer

In [2]:
import argparse
import torch
from starcoder_finetune.dialogues import DialogueTemplate, get_dialogue_template
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          GenerationConfig, set_seed)
from jsonformer import Jsonformer

def initialize_model(model_id):
    """Load a causal language model, its tokenizer and a dialogue template.

    The random seed is fixed to 42 before anything is loaded. If the model
    repository provides no dialogue template, the ``no_system`` template is
    used instead. The tokenizer's EOS token is set to the template's end token.

    Args:
        model_id: model identifier or path passed to ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer, dialogue_template)``.
    """
    revision = None
    set_seed(42)

    try:
        dialogue_template = DialogueTemplate.from_pretrained(model_id, revision=revision)
    except Exception:
        print("No dialogue template found in model repo. Defaulting to the `no_system` template.")
        dialogue_template = get_dialogue_template("no_system")

    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    # Use the dialogue template's end token as the end-of-sequence token.
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids(dialogue_template.end_token)

    # 8-bit weights with automatic device placement; fp16 for the remaining tensors.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, revision=revision, load_in_8bit=True, device_map="auto", torch_dtype=torch.float16
    )

    return model, tokenizer, dialogue_template

2023-08-28 14:51:14.437471: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-28 14:51:14.724011: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
def generate_response_as_json_starcoder(prompt_text):
    """Predict one foreign key for ``prompt_text`` with the loaded StarCoder model.

    Reads the module-level ``model``, ``tokenizer`` and ``dialogue_template``
    and overwrites ``dialogue_template.messages`` with a single user turn.
    Generation is schema-guided through Jsonformer: table names are fixed to
    the tables parsed from the prompt and column names are limited to the
    parsed column lists.

    Returns:
        Whatever the patched Jsonformer instance returns when called.
    """
    dialogue_template.messages = [
        {
            "role": "user",
            "content": prompt_text,
        }
    ]
    formatted_prompt = dialogue_template.get_inference_prompt()

    src_table, src_cols, trg_table, trg_cols = get_enums_from_prompt_text(prompt_text)

    def enum_of(allowed):
        # Schema fragment restricting a property to the given values.
        return {"type": "enum", "values": allowed}

    json_schema_fk = {
        "type": "object",
        "properties": {
            "table": enum_of([src_table]),
            "column": enum_of(src_cols),
            "referencedTable": enum_of([trg_table]),
            "referencedColumn": enum_of(trg_cols),
        },
        "required": ["table", "column", "referencedTable", "referencedColumn"],
    }

    constrained_generator = Jsonformer(model, tokenizer, json_schema_fk, formatted_prompt, debug=False)
    apply_generate_array_fix(constrained_generator)
    return constrained_generator()

In [None]:
# prepare openai

In [4]:
import openai

import os

# Take the key from the OPENAI_API_KEY environment variable so that no real
# secret has to be written into (and committed with) the notebook. The
# placeholder is only a fallback and will not authenticate.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-...")

from tenacity import retry, wait_random_exponential, stop_after_attempt
@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3))
def generate_response_as_json_gpt35(prompt_text):
    """Predict one foreign key for ``prompt_text`` with gpt-3.5-turbo function calling.

    The table and column names parsed from the prompt are passed as JSON Schema
    ``enum`` constraints of a single function, ``get_foreign_key``, and the
    model is forced to call that function. The decorator retries the whole
    call with randomized exponential back-off, stopping after three attempts.

    Returns:
        dict parsed from the function-call arguments, with the keys ``table``,
        ``column``, ``referencedTable`` and ``referencedColumn`` requested by
        the schema.
    """
    # Imported locally so this cell does not rely on a later cell's import.
    import json

    src_table, src_cols, trg_table, trg_cols = get_enums_from_prompt_text(prompt_text)

    functions = [
        {
            "name": "get_foreign_key",
            "description": "Get foreign keys",
            "parameters": {
                "type": "object",
                "properties": {
                    # "enum" is the JSON Schema keyword for a closed set of
                    # allowed values.
                    "table": {
                        "type": "string",
                        "enum": [src_table],
                    },
                    "column": {
                        "type": "string",
                        "enum": src_cols,
                    },
                    "referencedTable": {
                        "type": "string",
                        "enum": [trg_table],
                    },
                    "referencedColumn": {
                        "type": "string",
                        "enum": trg_cols,
                    },
                },
                "required": ["table", "column", "referencedTable", "referencedColumn"],
            },
        }
    ]
    foreign_keys_llm = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that always returns responses in JSON without any additional explanations. Only Respond with the desired JSON, NOTHING else, no explanations."},
            {"role": "user", "content": prompt_text},
        ],
        functions=functions,
        # Force the function call so the reply always carries "function_call"
        # instead of, possibly, a plain-text answer.
        function_call={"name": "get_foreign_key"},
        temperature=0,
    )["choices"][0]["message"]

    return json.loads(foreign_keys_llm["function_call"]["arguments"])

In [None]:
# generate predictions

In [5]:
import json
import os
# Load the prompt / ground-truth files of the three benchmarks. Each maps a
# dataset name to an entry whose "prompt" and "foreign_key" fields are read by
# the generation and evaluation cells. `with` closes every file right after
# it has been parsed.
with open("prompts_ground_truth_spider.json") as prompts_file:
    prompts_ground_truth_spider = json.load(prompts_file)
with open("prompts_ground_truth_bird.json") as prompts_file:
    prompts_ground_truth_bird = json.load(prompts_file)
with open("prompts_ground_truth_ctu.json") as prompts_file:
    prompts_ground_truth_ctu = json.load(prompts_file)

In [6]:
def get_enums_from_prompt_text(prompt_text):
    """Extract table names and column lists from lines 2 and 3 of a prompt.

    Both lines are read as ``table(col_a, col_b, ...)``: the text before the
    first "(" is the table name, and the text between the first "(" and the
    first ")" is split on ", " into column names.

    Returns:
        Tuple ``(src_table, src_cols, trg_table, trg_cols)``.
    """
    def split_signature(signature):
        # Cut at the first opening and the first closing parenthesis.
        open_at = signature.find("(")
        close_at = signature.find(")")
        return signature[:open_at], signature[open_at + 1:close_at].split(", ")

    prompt_lines = prompt_text.split("\n")
    src_table, src_cols = split_signature(prompt_lines[1])
    trg_table, trg_cols = split_signature(prompt_lines[2])
    return src_table, src_cols, trg_table, trg_cols

def generate_responses(prompts_ground_truth, method, output_file, recompute=False):
    """Run ``method`` on every prompt and checkpoint the results to ``output_file``.

    Args:
        prompts_ground_truth: mapping of dataset name to an entry with a
            ``"prompt"`` field.
        method: callable taking the prompt text and returning a
            JSON-serialisable prediction.
        output_file: path of the JSON cache. Entries already stored there are
            not recomputed unless ``recompute`` is true.
        recompute: ignore an existing cache file and start from scratch.

    Returns:
        dict mapping dataset name to the prediction, or to ``None`` when
        ``method`` raised. Cached ``None`` entries are not retried either.
    """
    responses = {}
    if os.path.exists(output_file) and not recompute:
        with open(output_file) as cached_file:
            responses = json.load(cached_file)

    for dataset_name in prompts_ground_truth:
        if dataset_name in responses:
            continue

        print(f"determining fk's for: {dataset_name}")

        prompt = prompts_ground_truth[dataset_name]["prompt"]
        response_json = None
        try:
            response_json = method(prompt)
        except Exception as e:
            # Best effort: log the failure and record it as None so the run continues.
            print(f"error with prompt {dataset_name}: {str(e)}")
        responses[dataset_name] = response_json

        # Rewrite the cache after every prompt so an interrupted run can resume.
        with open(output_file, "w") as f:
            json.dump(responses, f)
    return responses

In [None]:
## generate predictions starcoder schemapile

In [19]:
# Load the "starcoder-schemapile" checkpoint. This rebinds the global
# model / tokenizer / dialogue_template that generate_response_as_json_starcoder
# reads, so it must be run right before the schemapile prediction cells.
model, tokenizer, dialogue_template = initialize_model("starcoder-schemapile")

No dialogue template found in model repo. Defaulting to the `no_system` template.


This model has some weights that should be kept in higher precision, you need to upgrade `accelerate` to properly deal with them (`pip install --upgrade accelerate`).


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [22]:
# Spider prompts with the currently loaded StarCoder model; dataset names
# already present in the JSON cache file are skipped.
responses_starcoder_schemapile_spider = generate_responses(prompts_ground_truth_spider,
                               generate_response_as_json_starcoder, "responses_starcoder_schemapile_spider.json")

In [23]:
# BIRD prompts with the currently loaded StarCoder model; dataset names
# already present in the JSON cache file are skipped.
responses_starcoder_schemapile_bird = generate_responses(prompts_ground_truth_bird,
                               generate_response_as_json_starcoder, "responses_starcoder_schemapile_bird.json")

In [24]:
# CTU prompts with the currently loaded StarCoder model; dataset names
# already present in the JSON cache file are skipped.
responses_starcoder_schemapile_ctu = generate_responses(prompts_ground_truth_ctu,
                               generate_response_as_json_starcoder, "responses_starcoder_schemapile_ctu.json")

In [None]:
## generate predictions starcoder alpha

In [16]:
# Load HuggingFaceH4/starchat-alpha. This rebinds the global
# model / tokenizer / dialogue_template read by generate_response_as_json_starcoder.
# NOTE(review): the recorded execution counts suggest this cell ran before the
# schemapile model was loaded; re-run it right before the alpha prediction
# cells, otherwise uncached prompts are answered by whichever model is loaded.
model, tokenizer, dialogue_template = initialize_model("HuggingFaceH4/starchat-alpha")

This model has some weights that should be kept in higher precision, you need to upgrade `accelerate` to properly deal with them (`pip install --upgrade accelerate`).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [25]:
# Spider prompts with the currently loaded StarCoder model (expected to be
# starchat-alpha); dataset names already in the JSON cache file are skipped.
responses_starcoder_alpha_spider = generate_responses(prompts_ground_truth_spider, 
                               generate_response_as_json_starcoder, "responses_starcoder_alpha_spider.json")

In [26]:
# BIRD prompts with the currently loaded StarCoder model (expected to be
# starchat-alpha); dataset names already in the JSON cache file are skipped.
responses_starcoder_alpha_bird = generate_responses(prompts_ground_truth_bird, 
                               generate_response_as_json_starcoder, "responses_starcoder_alpha_bird.json")

In [None]:
# CTU prompts with the currently loaded StarCoder model (expected to be
# starchat-alpha); dataset names already in the JSON cache file are skipped.
responses_starcoder_alpha_ctu = generate_responses(prompts_ground_truth_ctu, 
                               generate_response_as_json_starcoder, "responses_starcoder_alpha_ctu.json")

In [None]:
## generate predictions gpt3.5

In [20]:
# Spider prompts answered through the OpenAI API; dataset names already
# present in the JSON cache file are skipped, so re-running only fills gaps.
responses_gpt35_spider = generate_responses(prompts_ground_truth_spider, 
                   generate_response_as_json_gpt35, "responses_gpt35_spider.json")

determining fk's for: college_1_sqlite_class_course
determining fk's for: college_1_sqlite_course_department
determining fk's for: college_1_sqlite_department_employee
determining fk's for: college_1_sqlite_enroll_student
determining fk's for: college_1_sqlite_enroll_class
determining fk's for: college_1_sqlite_professor_department
determining fk's for: college_1_sqlite_professor_employee
determining fk's for: college_1_sqlite_student_department
determining fk's for: perpetrator_sqlite_perpetrator_people
determining fk's for: wine_1_sqlite_wine_appellations
determining fk's for: riding_club_sqlite_coach_club
determining fk's for: riding_club_sqlite_player_coach_coach
determining fk's for: riding_club_sqlite_player_coach_player
determining fk's for: riding_club_sqlite_match_result_club
determining fk's for: insurance_and_eClaims_sqlite_policies_customers
determining fk's for: insurance_and_eClaims_sqlite_claim_headers_policies
determining fk's for: insurance_and_eClaims_sqlite_claims_do

determining fk's for: swimming_sqlite_event_stadium
determining fk's for: swimming_sqlite_record_swimmer
determining fk's for: swimming_sqlite_record_event
determining fk's for: protein_institute_sqlite_institution_building
determining fk's for: protein_institute_sqlite_protein_institution
determining fk's for: geo_sqlite_city_state
determining fk's for: geo_sqlite_highlow_state
determining fk's for: geo_sqlite_mountain_state
determining fk's for: geo_sqlite_river_state
determining fk's for: farm_sqlite_farm_competition_city
determining fk's for: farm_sqlite_competition_record_farm
determining fk's for: farm_sqlite_competition_record_farm_competition
determining fk's for: medicine_enzyme_interaction_sqlite_medicine_enzyme_interaction_medicine
determining fk's for: medicine_enzyme_interaction_sqlite_medicine_enzyme_interaction_enzyme
determining fk's for: tracking_software_problems_sqlite_problem_log_problem_status_codes
determining fk's for: tracking_software_problems_sqlite_problem_lo

In [52]:
# BIRD prompts answered through the OpenAI API; dataset names already
# present in the JSON cache file are skipped, so re-running only fills gaps.
responses_gpt35_bird = generate_responses(prompts_ground_truth_bird, 
                   generate_response_as_json_gpt35, "responses_gpt35_bird.json")

determining fk's for: college_completion_sqlite_state_sector_grads_state_sector_details


In [None]:
# CTU prompts answered through the OpenAI API; dataset names already
# present in the JSON cache file are skipped, so re-running only fills gaps.
responses_gpt35_ctu = generate_responses(prompts_ground_truth_ctu, 
                   generate_response_as_json_gpt35, "responses_gpt35_ctu.json")

In [None]:
# evaluation

In [25]:
def evaluate_responses(responses, prompts_ground_truth):
    """Print how many predicted foreign keys equal the ground truth.

    Args:
        responses: mapping of dataset name to the predicted foreign key, or to
            ``None`` when generation failed.
        prompts_ground_truth: mapping of dataset name to an entry whose
            ``"foreign_key"`` field is a JSON string.

    Prints the number of matches, mismatches and errors (``None``
    predictions) and the success rate ``match / (match + no match + error)``.
    Datasets missing from the ground truth are reported and left out of all
    counts. Returns nothing.
    """
    error = 0
    match = 0
    no_match = 0

    for dataset_name in responses:
        if dataset_name not in prompts_ground_truth:
            print(f"dataset {dataset_name} not found in ground truth, skipping")
            continue

        foreign_key_ground_truth = json.loads(prompts_ground_truth[dataset_name]["foreign_key"])
        foreign_key_prediction = responses[dataset_name]

        if foreign_key_prediction is None:
            error += 1
            continue

        # Plain equality: the order of keys in a JSON object carries no
        # meaning, and a prediction that is not a dict simply counts as a miss.
        if foreign_key_prediction == foreign_key_ground_truth:
            match += 1
        else:
            no_match += 1

    total = match + no_match + error
    # With nothing to evaluate the rate is undefined; report nan instead of
    # failing with a division by zero.
    success_rate = match / total if total else float("nan")

    print("match: "+str(match))
    print("no match: "+str(no_match))
    print("error: "+str(error))
    print("success rate: "+str(success_rate))

In [21]:
# GPT-3.5 predictions scored against the Spider ground truth.
evaluate_responses(responses_gpt35_spider, prompts_ground_truth_spider)

match: 458
no match: 65
error: 0
success rate: 0.875717017208413


In [74]:
# GPT-3.5 predictions scored against the BIRD ground truth.
evaluate_responses(responses_gpt35_bird, prompts_ground_truth_bird)

match: 229
no match: 48
error: 1
success rate: 0.8237410071942446


In [75]:
# GPT-3.5 predictions scored against the CTU ground truth.
evaluate_responses(responses_gpt35_ctu, prompts_ground_truth_ctu)

match: 539
no match: 86
error: 0
success rate: 0.8624


In [27]:
# starchat-alpha predictions scored against the Spider ground truth.
evaluate_responses(responses_starcoder_alpha_spider, prompts_ground_truth_spider)

match: 338
no match: 185
error: 0
success rate: 0.6462715105162524


In [28]:
# starchat-alpha predictions scored against the BIRD ground truth.
evaluate_responses(responses_starcoder_alpha_bird, prompts_ground_truth_bird)

match: 171
no match: 107
error: 0
success rate: 0.6151079136690647


In [10]:
# starchat-alpha predictions scored against the CTU ground truth.
evaluate_responses(responses_starcoder_alpha_ctu, prompts_ground_truth_ctu)

match: 513
no match: 112
error: 0
success rate: 0.8208


In [26]:
# starcoder-schemapile predictions scored against the Spider ground truth.
evaluate_responses(responses_starcoder_schemapile_spider, prompts_ground_truth_spider)

match: 506
no match: 17
error: 0
success rate: 0.9674952198852772


In [27]:
# starcoder-schemapile predictions scored against the BIRD ground truth.
evaluate_responses(responses_starcoder_schemapile_bird, prompts_ground_truth_bird)

match: 263
no match: 15
error: 0
success rate: 0.9460431654676259


In [28]:
# starcoder-schemapile predictions scored against the CTU ground truth.
evaluate_responses(responses_starcoder_schemapile_ctu, prompts_ground_truth_ctu)

match: 609
no match: 16
error: 0
success rate: 0.9744


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# Define the models and group data
models = ['jaccard', 'gpt-3.5', 'starcoder-alpha', 'starcoder-schemapile']
groups = ['spider', 'bird', 'ctu']
# One row per entry of `groups`, one column per entry of `models`.
# NOTE(review): these numbers are typed in by hand. The gpt-3.5 and starcoder
# columns correspond to the success rates printed by the evaluation cells at
# two decimals (some starcoder-alpha values look truncated rather than
# rounded); the jaccard baseline is not computed in this notebook -- confirm
# its source.
group_data = np.array([[0.58, 0.88, 0.64, 0.97], [0.63, 0.82,0.61, 0.95], [0.36, 0.86, 0.82, 0.97]])

# Define a list of colors
colors = sns.color_palette('deep')[0:4]

barWidth = 0.20

# Create bars: model i is drawn i bar-widths to the right of the group origin
for i in range(len(models)):  # for each model
    r = [j + barWidth*i for j in range(len(groups))]
    plt.bar(r, group_data[:, i], width=barWidth, color=colors[i], edgecolor='grey', label=models[i])

# Adding xticks. The bars of a group are centred at 0 .. (len(models) - 1)
# bar widths, so the middle of the group lies at half of that span.
group_center_offset = barWidth * (len(models) - 1) / 2
plt.xlabel('Groups', fontweight='bold')
plt.xticks([group_index + group_center_offset for group_index in range(len(groups))], groups)

# Add ylabel
plt.ylabel('recall@k=1')

# Add a legend outside the axes, to the right of the plot
plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)

# Show the plot
plt.show()