In [264]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [265]:
import sys
import os 

paths = ['../src/', '../src/api/v6']
for path in paths:
    path = os.path.normcase(path)
    if not any(os.path.normcase(sp) == path for sp in sys.path):
        sys.path.append(path)

In [171]:
import os
import openai
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from transformers import GPT2TokenizerFast


In [4]:
openai.organization = os.getenv("OPENAI_API_ORG")
openai.api_key = os.getenv("OPENAI_API_KEY")

## Building a prompt

**Build prompt from the API docstrings**

In [230]:

def build_spec_prompt():
    prompt_dict = {}
    for prompt_file in glob.glob('../config/prompts/*.txt'):
        key = os.path.basename(prompt_file).split('.')[0].lower()
        with open(prompt_file, "r") as f:
            prompt_dict[key] = f.read()
    
    prompt = ""
    for key, value in prompt_dict.items():
        prompt = prompt + f"# {key.upper()}:\n\n{value}\n\n"
    
    return prompt

In [263]:
prompt = build_spec_prompt()

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
len(tokenizer(prompt, max_length=51200, truncation=True)["input_ids"])

20659

**Examples prompt**


Building a prompt from the text and code examples generated by the generator

In [232]:
file_path = '/Users/asaf/Workspace/biu/complex-utterance-to-code/build/train_complex_utterance_to_code_with_intermediate_40k.csv.gz'
examples_df = pd.read_csv(file_path)
examples_df = examples_df.reset_index()  # make sure indexes pair with number of rows

In [233]:
def build_example_prompt(text, code=None):
    examples_prompt = f"text: \n{text}\n\n"
    examples_prompt += f"code: \n{code}\n\n\n" if code else f"code: \n"
    
    return examples_prompt


def build_examples_prompt(examples_prompt, df, limit=10):
    examples_prompt = examples_prompt or ""
    for index, row in df[:limit].iterrows():
        examples_prompt += build_example_prompt(text=row['text'], code=row['code'])
    
    return examples_prompt

In [262]:
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
prompt = build_examples_prompt(examples_prompt, examples_df, limit=18)

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
len(tokenizer(prompt, max_length=51200, truncation=True)["input_ids"])


3417

## Evaluation dataset

In [222]:
file_name = 'eval_complex_utterance_to_code_with_intermediate_82_20230509.csv.gz'
base_path = '/Users/asaf/Workspace/biu/complex-utterance-to-code/build'
eval_df = pd.read_csv(os.path.join(base_path, file_name))
eval_df = eval_df.reset_index()  # make sure indexes pair with number of rows

In [89]:
eval_df["code"].str.len().max()

854

In [119]:
def build_test_code(code: str, imports: str, test: str, code_embed_str: str = '# end code block to test', fail_on_error: bool = False, verbose: str = 'Fatal'):
  try:
    code_insert_idx = test.find(code_embed_str)
    program_code = imports
    program_code += '\n'
    program_code += test[:code_insert_idx]
    program_code += code
    program_code += '\n'
    program_code += test[code_insert_idx:]
  except Exception as e:
    if verbose == 'Error':
      print('[ERROR] Failed to unparse code rep to code\n', e)
    if fail_on_error:
      raise e
    program_code = ''
  finally:
    return program_code

def eval_code(code: str):
  test_results = {}
  try:
    context = {}
    exec(code, context)
    test_results = context.get('test_results', {})
  except AssertionError as e:
    test_results['test_failuers'] = test_results.get('test_failuers', 0) + 1
  except Exception as e:
    test_results['code_failure'] = test_results.get('code_failure', 0) + 1
  
  return test_results
  

def compute_scores(df, index):
  # df = pd.DataFrame(data)
  df = df.set_index(index).sort_index()
  df['correct'] = df['results'].apply(lambda test_results: test_results.get('correct', 0))
  df['incorrect'] = df['results'].apply(lambda test_results: test_results.get('incorrect', 0))
  df['total'] = df['correct'] + df['incorrect']
  df['code_failure'] = df['results'].apply(lambda test_results: test_results.get('code_failure', 0))
  df['score'] = ((1 - df['code_failure']) * (df['correct'] / df['total'])).replace({np.nan: 0})

  return df

## OpenAI Predictions

### List available models

In [187]:
oai_models = openai.Model.list()
print([model_data['id'] for model_data in oai_models['data']])

['babbage', 'davinci', 'text-davinci-edit-001', 'whisper-1', 'babbage-code-search-code', 'text-similarity-babbage-001', 'code-davinci-edit-001', 'text-davinci-001', 'ada', 'babbage-code-search-text', 'babbage-similarity', 'code-search-babbage-text-001', 'text-curie-001', 'code-search-babbage-code-001', 'text-ada-001', 'text-embedding-ada-002', 'text-similarity-ada-001', 'curie-instruct-beta', 'gpt-3.5-turbo', 'ada-code-search-code', 'ada-similarity', 'code-search-ada-text-001', 'text-search-ada-query-001', 'davinci-search-document', 'gpt-3.5-turbo-0301', 'ada-code-search-text', 'text-search-ada-doc-001', 'davinci-instruct-beta', 'text-similarity-curie-001', 'code-search-ada-code-001', 'ada-search-query', 'text-search-davinci-query-001', 'curie-search-query', 'davinci-search-query', 'babbage-search-document', 'ada-search-document', 'gpt-4-0314', 'text-search-curie-query-001', 'text-search-babbage-doc-001', 'gpt-4', 'curie-search-document', 'text-davinci-003', 'text-search-curie-doc-001'

In [220]:
print([model_data['id'] for model_data in oai_models['data'] if 'code' in model_data['id']])

['babbage-code-search-code', 'code-davinci-edit-001', 'babbage-code-search-text', 'code-search-babbage-text-001', 'code-search-babbage-code-001', 'ada-code-search-code', 'code-search-ada-text-001', 'ada-code-search-text', 'code-search-ada-code-001']


<Model model id=babbage at 0x166040810> JSON: {
  "created": 1649358449,
  "id": "babbage",
  "object": "model",
  "owned_by": "openai",
  "parent": null,
  "permission": [
    {
      "allow_create_engine": false,
      "allow_fine_tuning": false,
      "allow_logprobs": true,
      "allow_sampling": true,
      "allow_search_indices": false,
      "allow_view": true,
      "created": 1669085501,
      "group": null,
      "id": "modelperm-49FUp5v084tBB49tC4z8LPH5",
      "is_blocking": false,
      "object": "model_permission",
      "organization": "*"
    }
  ],
  "root": "babbage"
}

### text-davinci-003

In [248]:
MODEL_NAME = 'text-davinci-003'

In [253]:
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
base_prompt = build_examples_prompt(examples_prompt, examples_df, limit=25)

responses = []
for i, row  in tqdm_notebook(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    print(len(prompt.split()) + 1000)
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/82 [00:00<?, ?it/s]

4251


InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 11565 tokens (10565 in your prompt; 1000 for the completion). Please reduce your prompt; or completion length.

In [241]:
responses = []
samples_df = eval_df[:2]
for i, row  in tqdm_notebook(samples_df.iterrows(), total=samples_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/2 [00:00<?, ?it/s]

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 16622 tokens (15622 in your prompt; 1000 for the completion). Please reduce your prompt; or completion length.

In [None]:
responses_df = pd.DataFrame(responses)
responses_df.to_csv(f'../build/openai-{MODEL_NAME}-{file_name}', index=False, compression='gzip')

In [None]:
responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)

0     walmart_availability = Store.check_availabilit...
1     date_time = DateTime.resolve_from_text("tomorr...
2     date_time = DateTime.resolve_from_text("tomorr...
3     music_source = MusicSource.resolve_from_text("...
4     recipient = Recipient.resolve_from_text("Dad")...
                            ...                        
77    spotify_playlist_name = "lofi"\nMediaPlayer.pl...
78    date_time = DateTime.resolve_from_text("tonigh...
79    date_time = DateTime.resolve_from_text("tonigh...
80    date_time_start = DateTime.resolve_from_text("...
81    date_time_tomorrow = DateTime.resolve_from_tex...
Name: choices, Length: 82, dtype: object

#### Evaluating the results

In [None]:
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'generated_code'],
      dtype='object')

In [None]:
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0625

In [237]:
MODEL_NAME = 'text-davinci-003'

In [238]:
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
base_prompt = build_examples_prompt(examples_prompt, examples_df, limit=13)

responses = []
for i, row  in tqdm_notebook(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/82 [00:00<?, ?it/s]

In [191]:
responses_df = pd.DataFrame(responses)
responses_df.to_csv(f'../build/openai-{MODEL_NAME}-{file_name}', index=False, compression='gzip')

In [205]:
responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)

0     walmart_availability = Store.check_availabilit...
1     date_time = DateTime.resolve_from_text("tomorr...
2     date_time = DateTime.resolve_from_text("tomorr...
3     music_source = MusicSource.resolve_from_text("...
4     recipient = Recipient.resolve_from_text("Dad")...
                            ...                        
77    spotify_playlist_name = "lofi"\nMediaPlayer.pl...
78    date_time = DateTime.resolve_from_text("tonigh...
79    date_time = DateTime.resolve_from_text("tonigh...
80    date_time_start = DateTime.resolve_from_text("...
81    date_time_tomorrow = DateTime.resolve_from_tex...
Name: choices, Length: 82, dtype: object

#### Evaluating the results

In [212]:
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'generated_code'],
      dtype='object')

In [219]:
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0625

### gpt-4

MODEL_NAME = 'text-gpt4'
model = openai.Model(MODEL_NAME)

In [271]:
df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8]},
                  index=['falcon', 'dog', 'spider', 'fish'])
df2 = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8]},)
df

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [272]:
pd.concat([df, df2], axis=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen,num_legs.1,num_wings.1,num_specimen_seen.1
falcon,2.0,2.0,10.0,,,
dog,4.0,0.0,2.0,,,
spider,8.0,0.0,1.0,,,
fish,0.0,0.0,8.0,,,
0,,,,2.0,2.0,10.0
1,,,,4.0,0.0,2.0
2,,,,8.0,0.0,1.0
3,,,,0.0,0.0,8.0


In [None]:
import evaluate

module = evaluate.load("dvitel/codebleu")
src = 'class AcidicSwampOoze(MinionCard):§    def __init__(self):§        super().__init__("Acidic Swamp Ooze", 2, CHARACTER_CLASS.ALL, CARD_RARITY.COMMON, battlecry=Battlecry(Destroy(), WeaponSelector(EnemyPlayer())))§§    def create_minion(self, player):§        return Minion(3, 2)§'
tgt = 'class AcidSwampOoze(MinionCard):§    def __init__(self):§        super().__init__("Acidic Swamp Ooze", 2, CHARACTER_CLASS.ALL, CARD_RARITY.COMMON, battlecry=Battlecry(Destroy(), WeaponSelector(EnemyPlayer())))§§    def create_minion(self, player):§        return Minion(3, 2)§'
src = src.replace("§","\n")
tgt = tgt.replace("§","\n")
res = module.compute(predictions = [tgt], references = [[src]])
print(res)