In [1]:
# Load (or reload, if already loaded) the `dotenv` IPython extension and run its
# %dotenv magic. Presumably this exports the variables of a local .env file,
# which would supply the OPENAI_* values read later via os.getenv -- confirm
# that the .env file defines them.
%reload_ext dotenv
%dotenv

In [2]:
# Sanity check: print the kernel's current working directory.
!pwd

/Users/asaf/Workspace/biu/complex-utterance-to-code/notebooks


In [3]:
import sys
import os 

# Project root; every relative path used in this notebook is resolved against it.
WORK_AREA = '/Users/asaf/Workspace/biu/complex-utterance-to-code'
os.chdir(WORK_AREA)

# Source folders (relative to WORK_AREA) that must be importable.
paths = ['./src/', './src/api/v6', './notebooks/src']
for path in map(os.path.normcase, paths):
    # Register the folder only if no existing sys.path entry normalises to it,
    # so re-running the cell does not append duplicates.
    for sp in sys.path:
        if os.path.normcase(sp) == path:
            break
    else:
        sys.path.append(path)

In [4]:
from typing import Union, List
import openai
import glob
import pandas as pd
import numpy as np
import tqdm
from transformers import GPT2TokenizerFast
import math
import tokenize
from datetime import datetime
import time
import json
import re
import random
from llm.prompts import build_prompt
from llm.open_ai import OpenAIAPI

In [5]:
# Record which `openai` distribution is installed, as reported by `pip freeze`.
!pip freeze | grep openai

openai @ file:///home/conda/feedstock_root/build_artifacts/openai_1686159246812/work


In [6]:
# Configure the module-level openai client from environment variables.
# os.getenv returns None when a variable is not set.
openai.organization = os.getenv("OPENAI_API_ORG")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [7]:
# Set TOKENIZERS_PARALLELISM to 'false' for this process.
# NOTE(review): presumably this silences the HuggingFace tokenizers parallelism
# warning -- confirm it is still needed.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

## Evaluation data

In [8]:
# Default evaluation file and the directory it is read from.
FILE_NAME = 'eval_complex_utterance_to_code_with_intermediate_152_20231112.csv.gz'
BASE_PATH = '/Users/asaf/Workspace/biu/complex-utterance-to-code/build'

def load_eval_data(file_name: str = FILE_NAME, base_path: str = BASE_PATH, sample: int = 0, random_seed: int = 42) -> pd.DataFrame:
    """Load the evaluation set, optionally restricted to a random subset of sample ids.

    Args:
        file_name: CSV file to read, joined onto ``base_path``.
        base_path: Directory that contains ``file_name``.
        sample: Number of distinct ``sample_id`` values to keep. ``0`` (or any
            non-positive value) keeps every row.
        random_seed: Seed that determines which sample ids are drawn.

    Returns:
        The loaded frame after ``reset_index()`` (the previous index becomes an
        ``index`` column). When ``sample > 0`` only rows whose ``sample_id`` was
        drawn are kept, so the row count can exceed ``sample`` if an id spans
        several rows.

    Raises:
        ValueError: If ``sample`` is larger than the number of distinct sample
            ids (raised by ``random.Random.sample``).
    """
    eval_df = pd.read_csv(os.path.join(base_path, file_name))
    eval_df = eval_df.reset_index()  # make sure indexes pair with number of rows

    if sample > 0:
        sample_ids = eval_df['sample_id'].unique().tolist()
        # Use a private generator instead of random.seed(): it draws exactly the
        # same ids for a given seed but leaves the process-wide RNG untouched,
        # so callers that use `random` afterwards are not silently reseeded.
        rng = random.Random(random_seed)
        random_sample_ids = rng.sample(sample_ids, sample)
        eval_df = eval_df[eval_df['sample_id'].isin(random_sample_ids)]

    return eval_df

## Training data

In [9]:
# Confirm the working directory: the relative `build/...` path used in the next
# cell is resolved against it.
!pwd

/Users/asaf/Workspace/biu/complex-utterance-to-code


In [10]:
# Training examples, later handed to the prompt builder as `examples_df`.
file_path = 'build/train_complex_utterance_to_code_with_intermediate_40k.csv.gz'
# reset_index() keeps the loaded index as an `index` column next to a fresh 0..N-1 index.
examples_df = pd.read_csv(file_path).reset_index()
examples_df.head(3)

Unnamed: 0,index,text,code,lang_rep,code_rep
0,0,see if find my first reminders that I have a m...,"person_reminded = Contact.resolve_from_text(""m...",[ root\n\t[ S\n\t\t[ Command\n\t\t\t[ Action\n...,\t[ Module\n\t\t[ person_reminded = Contact.re...
1,1,create a reminder at mindnight to close the wi...,"date_time = DateTime.resolve_from_text(""mindni...",[ root\n\t[ S\n\t\t[ Command\n\t\t\t[ Action\n...,\t[ Module\n\t\t[ date_time = DateTime.resolve...
2,2,show route to my office from Northern Mariana ...,"origin = Location.resolve_from_text(""from Nort...",[ root\n\t[ S\n\t\t[ Command\n\t\t\t[ Conditio...,\t[ Module\n\t\t[ origin = Location.resolve_fr...


## Utils

In [34]:
def generate_predictions(
    test_df: pd.DataFrame,
    test_results_df: pd.DataFrame,
    model_name: str,
    platform: str,
    prompt_args: dict,
    n: int,
    results_file_path: str,
    max_tokens: int = 512,
    wait_time_in_seconds: int = 5,
    step_size: int = 0,
    platform_disabled: bool = False,
    serialize_response: bool = True,
    random_seed: int = 42
):
    """Request ``n`` completions for every test record and checkpoint them to a gzipped CSV.

    For each row of ``test_df`` the ``n`` completions are requested in batches
    of ``step_size``. After every batch the accumulated results frame is
    rewritten to ``results_file_path``, so an interrupted run leaves a usable
    partial file behind.

    Args:
        test_df: Records to predict for, indexed by their id label(s). All rows
            sharing an index value are written out together for each completion.
        test_results_df: Results collected so far (may be empty). Index values
            whose completions are already present are skipped.
        model_name: Model passed to the chat-completion call; also used (lower
            cased, ``-`` and ``/`` replaced by ``_``) in the serialization path.
        platform: Name used in the serialization path of raw responses.
        prompt_args: Keyword arguments for ``build_prompt``; must contain
            ``strategy`` and ``prompt_type``. ``input_data`` and ``seed`` are
            added per request.
        n: Number of completions wanted per record.
        results_file_path: Destination of the gzipped results CSV.
        max_tokens: Completion length limit forwarded to the API.
        wait_time_in_seconds: Pause after each API request.
        step_size: Completions requested per API call; ``0`` means all ``n`` at once.
        platform_disabled: When true, no request is made at all.
        serialize_response: Currently unused.
        random_seed: Base value of the per-request prompt seed.

    Returns:
        None. Results are only available through ``results_file_path``.
    """
    # A bare `import tqdm` does not load the `tqdm.notebook` submodule; import it
    # explicitly so the progress bars do not depend on another package having
    # imported it first.
    import tqdm.notebook

    open_api = OpenAIAPI(
        organization=os.getenv("OPENAI_API_ORG"),
        api_key=os.getenv("OPENAI_API_KEY")
    )
    total_records = test_df.shape[0]
    step_size = step_size if step_size > 0 else n
    id_labels = test_df.index.names
    # Start offsets of each batch; identical for every record.
    batch_steps = list(range(0, n, step_size))

    # generate predictions
    print(f"Generating predictions for {total_records} records")
    for i, (index, row) in tqdm.notebook.tqdm(enumerate(test_df.iterrows()), total=total_records, desc="Processing records"):
        # All rows that share this index value; they are emitted together.
        records_to_duplicate = test_df.loc[[index]]
        num_records = len(records_to_duplicate)

        # `.loc[[index]]` (list form) always yields a DataFrame, so `len` counts
        # rows even when a single row matches.
        has_results = (not test_results_df.empty) and (index in test_results_df.index)
        if has_results and len(test_results_df.loc[[index]]) == n * num_records:
            # if we have every result for this record, skip
            time.sleep(0.1)
            continue

        # iterate with step size until we reach a total of n
        for j, k in tqdm.notebook.tqdm(enumerate(batch_steps), total=len(batch_steps), leave=False, desc=f"Processing record {i}"):
            ns = list(np.arange(k, min(k + step_size, n)))

            # Re-evaluated per batch because test_results_df grows inside this loop.
            has_results = (not test_results_df.empty) and (index in test_results_df.index)
            existing_ns = test_results_df.loc[[index], 'n'] if has_results else None

            batch_done = has_results and existing_ns.isin(ns).sum() == len(ns) * num_records
            if batch_done or platform_disabled:
                # if we have a result for this batch, skip
                time.sleep(0.1)
                continue

            # remove the ns we already have
            if has_results:
                ns = [x for x in ns if (existing_ns != x).all()]
            if not ns:
                # nothing left to request for this batch
                continue

            # run the model, if we don't have a result
            seed = random_seed + i * len(batch_steps) + j
            args = {**prompt_args, "input_data": row, "seed": seed}
            messages = build_prompt(**args)

            index_values = [index] if len(id_labels) == 1 else list(index)
            serialize_id = '_'.join(f'{label}{str(value)}' for label, value in zip(id_labels, index_values))
            strategy = args['strategy']
            prompt_type = args['prompt_type']
            model_id = model_name.lower().replace('-', '_').replace('/', '_')
            response = open_api.chat_complete(
                model_name=model_name,
                messages=messages,
                max_tokens=max_tokens,
                n=len(ns),
                serialize_id=serialize_id,
                serialize_path=f'build/results/{platform}/responses/{model_id}/{strategy}/{prompt_type}/'
            )

            outputs = [x['message']['content'] for x in response['choices']]

            # duplicate the records
            copies = len(ns)
            duplicated_records = pd.concat([records_to_duplicate] * copies, ignore_index=False)
            # concat tiles the record block (r1, r2, r1, r2, ...). Repeat each
            # output / n once per record (o0, o0, o1, o1, ...) so that every
            # record is paired with every output exactly once; tiling both
            # sequences would pair a record with only a subset of the outputs
            # whenever several records share the index value.
            duplicated_records['output'] = [output for output in outputs for _ in range(num_records)]
            duplicated_records['n'] = [x for x in ns for _ in range(num_records)]
            duplicated_records['input_seed'] = [seed] * copies * num_records
            test_results_df = pd.concat([test_results_df, duplicated_records], ignore_index=False)

            # checkpoint after every batch
            test_results_df.to_csv(results_file_path, index=True, compression='gzip')

            time.sleep(wait_time_in_seconds)

## OpenAI Predictions

### List available models

In [35]:
# Authenticate the module-level client, then print the id of every model the
# account can access.
openai.organization = os.getenv("OPENAI_API_ORG")
openai.api_key = os.getenv("OPENAI_API_KEY")
oai_models = openai.Model.list()

model_ids = []
for model_data in oai_models['data']:
    model_ids.append(model_data['id'])
print(model_ids)

['gpt-3.5-turbo-16k', 'gpt-3.5-turbo-1106', 'dall-e-3', 'gpt-3.5-turbo-16k-0613', 'dall-e-2', 'text-embedding-3-large', 'whisper-1', 'tts-1-hd-1106', 'tts-1-hd', 'gpt-3.5-turbo', 'gpt-3.5-turbo-0125', 'gpt-4-0613', 'gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-instruct-0914', 'gpt-4', 'tts-1', 'davinci-002', 'gpt-3.5-turbo-instruct', 'babbage-002', 'gpt-4-1106-preview', 'gpt-4-vision-preview', 'tts-1-1106', 'gpt-4-0125-preview', 'gpt-4-turbo-preview', 'text-embedding-ada-002', 'text-embedding-3-small']


### Generate predictions

In [36]:
# Model and platform used by the prediction run below. `platform` is also used
# as a folder name in the results paths.
# Alternative model, kept for quick switching:
# model_name = 'gpt-3.5-turbo-1106'
model_name = 'gpt-4-0125-preview'
platform = 'open_ai'

In [42]:
def run_eval(
    model_name: str,
    platform: str,
    strategy: str, # 'text2rep', 'text2code'
    prompt_type: str = 'apispec', # 'examples' or 'apispec'
    examples_limit: int = 11,
    step_size: int = 0,
    n: int = 200,
    sample: int = 0,
    random_seed: int = 42,
):
    """Generate ``n`` predictions per evaluation record and store them in a new results file.

    The results file name encodes the run configuration plus a minute-resolution
    timestamp. If that exact file already exists nothing is generated.

    Args:
        model_name: Model to query.
        platform: Platform label; used as a folder name under ``./build/results``.
        strategy: Prompting strategy, ``'text2rep'`` or ``'text2code'``.
        prompt_type: ``'examples'`` or ``'apispec'``.
        examples_limit: Forwarded to the prompt builder as ``examples_limit``.
        step_size: Completions requested per API call (``0`` = all ``n`` at once).
        n: Number of completions per record.
        sample: Number of sample ids to evaluate (``0`` = all).
        random_seed: Seed for the sample selection; also forwarded as the base
            of the per-request prompt seed.

    Returns:
        None. Predictions are written to the results file by ``generate_predictions``.
    """
    print(f"Running evaluation for {model_name} on {platform}")
    id_labels = ['sample_id'] #['test_id', 'sample_id', 'sample_minor_id']
    model_id = model_name.lower().replace('-', '_').replace('/', '_')

    test_df = (
        load_eval_data(sample=sample, random_seed=random_seed)
        .set_index(id_labels)
        .sort_index()
    )

    timestamp = datetime.now().strftime("%Y%m%d%H%M")
    results_file_path = f"./build/results/{platform}/test-{str(test_df.shape[0])}-{platform}-{model_id}-{strategy}-n{n}-sample{sample}-{prompt_type}-ex{examples_limit}-seed{random_seed}-{timestamp}.csv.gz"
    if os.path.exists(results_file_path):
        print(f"Results file already exists: {results_file_path}")
        return

    # From here on the file is known not to exist, so the run always starts
    # from an empty results frame.
    print(f"Results file does not exist: {results_file_path}")
    # The results are written with DataFrame.to_csv, which does not create
    # missing folders.
    os.makedirs(os.path.dirname(results_file_path), exist_ok=True)

    print(f"Results will be saved to {results_file_path}")

    generate_predictions(
        test_df=test_df,
        test_results_df=pd.DataFrame(),
        model_name=model_name,
        platform=platform,
        prompt_args={
            "examples_df": examples_df,
            "prompt_type": prompt_type,
            "strategy": strategy,
            "examples_limit": examples_limit,
            "flattened_prompt": False,
        },
        n=n,
        results_file_path=results_file_path,
        max_tokens=512,
        wait_time_in_seconds=20,
        step_size=step_size,
        platform_disabled=False,
        serialize_response=True,
        random_seed=random_seed,
    )


In [46]:
# Run the evaluation, retrying on any error (e.g. transient API failures).
retries = 0      # Current retry count
max_retries = 20
sleep_time = 60  # Seconds to wait between attempts

while retries < max_retries:
    try:
        run_eval(
            model_name=model_name,
            platform=platform,
            strategy='text2rep',
            prompt_type='apispec',
            examples_limit=11,
            step_size=100,
            n=200,
            sample=15,
            random_seed=random.randint(0, 100000)
        )
    except Exception as e:
        print(f"run_eval method failed: {e}")
        retries += 1
        if retries >= max_retries:
            # Give up immediately: waiting is pointless when no attempt follows.
            print("Maximum retries reached, giving up.")
            break
        print(f"Sleeping for {sleep_time} seconds...")
        time.sleep(sleep_time)
        print("Retrying...")
    else:
        print("run_eval succeeded")
        break  # Exit the loop if the method succeeds

if retries == max_retries:
    print("Failed after maximum retries.")

Running evaluation for gpt-4-0125-preview on open_ai
Results file does not exist: ./build/results/open_ai/test-19-open_ai-gpt_4_0125_preview-text2rep-n200-sample15-apispec-ex11-seed50110-202403181720.csv.gz
Results will be saved to ./build/results/open_ai/test-19-open_ai-gpt_4_0125_preview-text2rep-n200-sample15-apispec-ex11-seed50110-202403181720.csv.gz
Generating predictions for 19 records


Processing records:   0%|          | 0/19 [00:00<?, ?it/s]

Processing record 0:   0%|          | 0/2 [00:00<?, ?it/s]

Processing record 1:   0%|          | 0/2 [00:00<?, ?it/s]

Processing record 2:   0%|          | 0/2 [00:00<?, ?it/s]

Processing record 3:   0%|          | 0/2 [00:00<?, ?it/s]

Processing record 4:   0%|          | 0/2 [00:00<?, ?it/s]

Processing record 5:   0%|          | 0/2 [00:00<?, ?it/s]

### Evaluate

In [44]:
# Result files of earlier runs, kept for reference (uncomment one to re-evaluate it):
# results_file_path = "./build/results/open_ai/test-21-open_ai-gpt_3.5_turbo_1106-text2code-n200-sample15-apispec-ex11.csv.gz"
# results_file_path = "./build/results/open_ai/test-21-open_ai-gpt_4_0125_preview-text2code-n200-sample15-apispec-ex11-202403181619.csv.gz"
# results_file_path = "./build/results/open_ai/test-18-open_ai-gpt_4_0125_preview-text2code-n200-sample15-apispec-ex11-seed81537-202403181659.csv.gz"

# Result file evaluated by the next cell.
results_file_path = "./build/results/open_ai/test-19-open_ai-gpt_4_0125_preview-text2rep-n200-sample15-apispec-ex11-seed50110-202403181720.csv.gz"

In [45]:
from utils.eval_utils import model_eval


print(f'Processing {results_file_path}')

# Files whose name contains '2rep' (not at position 0) are evaluated with
# parse_to_code=True. NOTE(review): presumably because text2rep runs emit an
# intermediate representation that must be parsed into code -- confirm.
parse_to_code = results_file_path.find('2rep') > 0
result, results_df = model_eval(
    results_file_path=results_file_path,
    parse_to_code=parse_to_code,
    parse_rules_enabled=True,
)

# Per pass@k metric: mean/std plus the share of entries scoring exactly 0 or 1.
humaneval_scores = result['humaneval']
for pass_k in humaneval_scores:
    scores = humaneval_scores[pass_k]
    num_scores = len(scores)

    print(f"{pass_k}.mean = {scores.mean()}")
    print(f"{pass_k}.std = {scores.std()}")
    print()
    failed_results_pct = (scores == 0).sum()/num_scores
    print(f"{pass_k}.failed_results_pct = {failed_results_pct}")
    success_results_pct = (scores == 1).sum()/num_scores
    print(f"{pass_k}.success_results_pct = {success_results_pct}")
    print()

# Share of individual result rows with accuracy 0 / 1.
accuracy = results_df['accuracy']
total_results = len(results_df)
total_failed_results = (accuracy == 0).sum()
total_success_results = (accuracy == 1).sum()
print(f"failed_results_pct = {total_failed_results / total_results}")
print(f"success_results_pct = {total_success_results / total_results}")

Processing ./build/results/open_ai/test-18-open_ai-gpt_4_0125_preview-text2code-n200-sample15-apispec-ex11-seed81537-202403181659.csv.gz
In humaneval_accuracy_score...
Evaluating test codes...


  0%|          | 0/3600 [00:00<?, ?it/s]

pass@1.mean = 0.36199999999999993
pass@1.std = 0.4360406272028199

pass@1.failed_results_pct = 0.5333333333333333
pass@1.success_results_pct = 0.2

pass@10.mean = 0.4661822332503201
pass@10.std = 0.5158633711750059

pass@10.failed_results_pct = 0.5333333333333333
pass@10.success_results_pct = 0.2

failed_results_pct = 0.6927777777777778
success_results_pct = 0.30722222222222223


In [None]:
"""
./build/results/open_ai/test-21-open_ai-gpt_4_0125_preview-text2code-n200-sample15-apispec-ex11-202403181619.csv.gz

pass@1.mean = 0.38733333333333325
pass@1.std = 0.4200249425700245

pass@1.failed_results_pct = 0.4
pass@1.success_results_pct = 0.13333333333333333

pass@10.mean = 0.5366838121575748
pass@10.std = 0.49673532832914213

pass@10.failed_results_pct = 0.4
pass@10.success_results_pct = 0.13333333333333333

failed_results_pct = 0.714047619047619
success_results_pct = 0.28595238095238096


./build/results/open_ai/test-18-open_ai-gpt_4_0125_preview-text2code-n200-sample15-apispec-ex11-seed81537-202403181659.csv.gz

pass@1.mean = 0.36199999999999993
pass@1.std = 0.4360406272028199

pass@1.failed_results_pct = 0.5333333333333333
pass@1.success_results_pct = 0.2

pass@10.mean = 0.4661822332503201
pass@10.std = 0.5158633711750059

pass@10.failed_results_pct = 0.5333333333333333
pass@10.success_results_pct = 0.2

failed_results_pct = 0.6927777777777778
success_results_pct = 0.30722222222222223
"""