In [9]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [10]:
import sys
import os 

WORK_AREA = "../"

# Remember the directory the kernel started in. A relative os.chdir is not
# idempotent: without this anchor, every re-run of the cell would climb one
# more directory level.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.abspath(os.path.join(NOTEBOOK_DIR, WORK_AREA)))

# Project source directories that must be importable (e.g. for `generator`).
paths = ['./src/', './src/api/v6']
for path in paths:
    # Register absolute paths so imports keep working even if the working
    # directory changes later in the session.
    path = os.path.normcase(os.path.abspath(path))
    if not any(os.path.normcase(sp) == path for sp in sys.path):
        sys.path.append(path)

In [40]:
import os
import openai
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from transformers import GPT2TokenizerFast
from datetime import datetime


In [12]:
# Configure the OpenAI client from environment variables.
# os.getenv returns None when a variable is unset, so a missing key is not reported here.
openai.organization = os.getenv("OPENAI_API_ORG")
openai.api_key = os.getenv("OPENAI_API_KEY")

## Build train data

In [None]:
import generator

In [8]:
%%time

# Generate the training set with the project-local `generator` module and write it
# as a gzipped CSV under build/ (the same file is read back in the examples section).
# NOTE(review): k is presumably the number of samples to generate (the progress
# output reports 40000/40000) — confirm against generator.main.
generator.main(
    k=40000, 
    print_console=False, 
    lang_representations=True, 
    code_representations=True, 
    output_file='build/train_3domains_complex_utterance_to_code_with_intermediate_40k.csv.gz',
    grammar_dir='config/grammar_messages_reminders_weather',
    seed=42, 
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-05-13 10:51:51 INFO: Downloading default packages for language: en (English) ...
2023-05-13 10:51:52 INFO: File exists: /Users/asaf/stanza_resources/en/default.zip
2023-05-13 10:51:55 INFO: Finished downloading models and saved to /Users/asaf/stanza_resources.
2023-05-13 10:51:55 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Using Stanza version: 1.4.2


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-05-13 10:51:56 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2023-05-13 10:51:56 INFO: Use device: cpu
2023-05-13 10:51:56 INFO: Loading: tokenize
2023-05-13 10:51:56 INFO: Loading: pos
2023-05-13 10:51:56 INFO: Loading: lemma
2023-05-13 10:51:56 INFO: Loading: depparse
2023-05-13 10:51:56 INFO: Done loading processors!


stanza parser created
Progress: |██████████████████████████████████████████████████| 100.0% Complete (40000/40000)
Succesfully saved samples to build/train_3domains_complex_utterance_to_code_with_intermediate_40k.csv.gz


## Building a prompt

**Build prompt from the API docstrings**

In [13]:

def build_spec_prompt(prompt_files_regexp='./config/prompt/**/*.txt'):
    """Concatenate the prompt files matched by a glob pattern into one prompt string.

    Each matched file contributes one section of the form
    ``# <NAME>:\\n\\n<file content>\\n\\n``, where ``<NAME>`` is the upper-cased part
    of the file's base name before the first dot.

    Args:
        prompt_files_regexp: glob pattern (not a regular expression, despite the
            name) selecting the prompt files. ``**`` matches any number of
            nested directories, including none.

    Returns:
        The concatenated sections, or an empty string when nothing matches.
        Files sharing the same base name collapse into a single section; the
        content of the last one in sorted path order wins.
    """
    prompt_dict = {}
    # recursive=True: without it glob treats `**` exactly like a single `*`.
    # sorted(): glob returns matches in arbitrary, OS-dependent order, which
    # would make the resulting prompt differ between machines and runs.
    for prompt_file in sorted(glob.glob(prompt_files_regexp, recursive=True)):
        key = os.path.basename(prompt_file).split('.')[0].lower()
        # Explicit encoding so the prompt does not depend on the platform locale.
        with open(prompt_file, "r", encoding="utf-8") as f:
            prompt_dict[key] = f.read()

    return "".join(
        f"# {key.upper()}:\n\n{value}\n\n" for key, value in prompt_dict.items()
    )

In [17]:
# Assemble the API specification prompt from the prompt files of the three domains.
spec_prompt = build_spec_prompt(prompt_files_regexp='./config/prompts_messages_reminders_weather/**/*.txt')

# Token count of the spec prompt under the GPT-2 tokenizer; truncation caps the count at 51200.
# NOTE(review): the GPT-2 vocabulary may differ from the tokenizer of the model queried
# later, so treat this number as an estimate — confirm.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
len(tokenizer(spec_prompt, max_length=51200, truncation=True)["input_ids"])

1725

**Examples prompt**


Build a few-shot prompt from the text/code example pairs produced by the generator.

In [18]:
# Load the generated training samples that serve as few-shot examples.
file_path = 'build/train_3domains_complex_utterance_to_code_with_intermediate_40k.csv.gz'
examples_df = pd.read_csv(file_path)
# reset_index() without drop=True keeps the previous index as an extra 'index' column.
examples_df = examples_df.reset_index()

In [26]:
def build_example_prompt(text, code=None):
    """Render one text/code example block for the prompt.

    When ``code`` is truthy the block is complete and ends with three newlines.
    Otherwise the block stops right after ``"code: \\n"`` so the model can
    continue from there.
    """
    text_section = "text: \n{}\n\n".format(text)
    if code:
        code_section = "code: \n{}\n\n\n".format(code)
    else:
        code_section = "code: \n"

    return text_section + code_section


def build_examples_prompt(base_prompt, df, limit=10):
    """Append the first ``limit`` rows of ``df`` to ``base_prompt`` as example blocks.

    Each row is rendered with ``build_example_prompt`` from its ``'text'`` and
    ``'code'`` columns. A falsy ``base_prompt`` is treated as an empty string.
    """
    prompt = base_prompt if base_prompt else ""
    for _, record in df[:limit].iterrows():
        prompt = prompt + build_example_prompt(text=record['text'], code=record['code'])

    return prompt

In [21]:
# Header that introduces the few-shot examples.
base_examples_prompt = """
Transform text to code

# EXAMPLES:

"""
# Header followed by the first 10 generated examples.
examples_prompt = build_examples_prompt(base_examples_prompt, examples_df, limit=10)

# Token count of the 10-example prompt under the GPT-2 tokenizer (capped at 51200 by truncation).
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
len(tokenizer(examples_prompt, max_length=51200, truncation=True)["input_ids"])


2903

## Evaluation dataset

In [22]:
file_name = 'eval_complex_utterance_to_code_with_intermediate_82_20230509.csv.gz'
# Resolve the evaluation set relative to the work area (the notebook chdir's there
# at the top) instead of a user-specific absolute path, so the notebook runs on
# other machines. Set EVAL_DATA_DIR to point at a different location.
base_path = os.getenv('EVAL_DATA_DIR', 'build')
eval_df = pd.read_csv(os.path.join(base_path, file_name))
# reset_index() without drop=True keeps the previous index as an extra 'index' column.
eval_df = eval_df.reset_index()

In [23]:
# Length in characters of the longest entry in the `code` column of the evaluation set.
eval_df["code"].str.len().max()

854

In [24]:
def build_test_code(code: str, imports: str, test: str, code_embed_str: str = '# end code block to test', fail_on_error: bool = False, verbose: str = 'Fatal'):
  """Splice generated code into a test program.

  The result is ``imports``, a newline, the part of ``test`` before the
  ``code_embed_str`` marker, ``code``, a newline, and the rest of ``test``
  starting at the marker.

  Args:
    code: code to embed in the test.
    imports: import statements placed at the top of the program.
    test: test source containing the ``code_embed_str`` marker.
    code_embed_str: marker in ``test`` before which ``code`` is inserted.
    fail_on_error: when True, re-raise any failure instead of returning ``''``.
    verbose: the failure is printed only when this equals ``'Error'``.

  Returns:
    The assembled program, or ``''`` when assembly fails and ``fail_on_error``
    is False.

  Raises:
    Exception: the original failure, only when ``fail_on_error`` is True. This
      includes ``ValueError`` for a missing marker and ``TypeError`` for
      non-string inputs.
  """
  try:
    code_insert_idx = test.find(code_embed_str)
    if code_insert_idx < 0:
      # str.find returns -1 when the marker is absent; slicing with -1 would
      # silently splice the code in before the last character of the test.
      raise ValueError(f'marker {code_embed_str!r} not found in test')
    return ''.join([imports, '\n', test[:code_insert_idx], code, '\n', test[code_insert_idx:]])
  except Exception as e:
    if verbose == 'Error':
      print('[ERROR] Failed to unparse code rep to code\n', e)
    if fail_on_error:
      # No `return` in a `finally` clause here: that would discard this exception.
      raise
    return ''

def eval_code(code: str):
  """Execute a test program and report its outcome as a dict.

  Returns the ``test_results`` object the program left in its namespace (or
  ``{}`` if it set none) when execution completes. Returns
  ``{'test_failuers': 1}`` when an ``AssertionError`` escapes and
  ``{'code_failure': 1}`` for any other ``Exception``.
  """
  # NOTE(review): exec runs the given source with full interpreter privileges;
  # callers in this notebook pass model-generated code, so consider sandboxing.
  namespace = {}
  try:
    exec(code, namespace)
  except AssertionError:
    # Results gathered before the failure are not carried over.
    return {'test_failuers': 1}
  except Exception:
    return {'code_failure': 1}

  return namespace.get('test_results', {})
  

def compute_scores(df, index):
  """Derive per-row counters and a score from the ``results`` column.

  Returns a new frame indexed by ``index`` and sorted by it, with the added
  columns ``correct``, ``incorrect``, ``total``, ``code_failure`` and
  ``score``. The score is ``(1 - code_failure) * correct / total``, with NaN
  (e.g. when ``total`` is 0) replaced by 0. The input frame is not modified.
  """
  scored = df.set_index(index).sort_index()

  def tally(key):
    # Counters missing from a result dict count as zero.
    return scored['results'].apply(lambda outcome: outcome.get(key, 0))

  scored['correct'] = tally('correct')
  scored['incorrect'] = tally('incorrect')
  scored['total'] = scored['correct'] + scored['incorrect']
  scored['code_failure'] = tally('code_failure')

  pass_ratio = scored['correct'] / scored['total']
  scored['score'] = ((1 - scored['code_failure']) * pass_ratio).replace({np.nan: 0})

  return scored

## OpenAI Predictions

### List available models

In [187]:
# Fetch the models visible to this account and print their ids.
oai_models = openai.Model.list()
print([model_data['id'] for model_data in oai_models['data']])

['babbage', 'davinci', 'text-davinci-edit-001', 'whisper-1', 'babbage-code-search-code', 'text-similarity-babbage-001', 'code-davinci-edit-001', 'text-davinci-001', 'ada', 'babbage-code-search-text', 'babbage-similarity', 'code-search-babbage-text-001', 'text-curie-001', 'code-search-babbage-code-001', 'text-ada-001', 'text-embedding-ada-002', 'text-similarity-ada-001', 'curie-instruct-beta', 'gpt-3.5-turbo', 'ada-code-search-code', 'ada-similarity', 'code-search-ada-text-001', 'text-search-ada-query-001', 'davinci-search-document', 'gpt-3.5-turbo-0301', 'ada-code-search-text', 'text-search-ada-doc-001', 'davinci-instruct-beta', 'text-similarity-curie-001', 'code-search-ada-code-001', 'ada-search-query', 'text-search-davinci-query-001', 'curie-search-query', 'davinci-search-query', 'babbage-search-document', 'ada-search-document', 'gpt-4-0314', 'text-search-curie-query-001', 'text-search-babbage-doc-001', 'gpt-4', 'curie-search-document', 'text-davinci-003', 'text-search-curie-doc-001'

In [220]:
# Keep only the ids that contain the substring 'code'.
print([model_data['id'] for model_data in oai_models['data'] if 'code' in model_data['id']])

['babbage-code-search-code', 'code-davinci-edit-001', 'babbage-code-search-text', 'code-search-babbage-text-001', 'code-search-babbage-code-001', 'ada-code-search-code', 'code-search-ada-text-001', 'ada-code-search-text', 'code-search-ada-code-001']


<Model model id=babbage at 0x166040810> JSON: {
  "created": 1649358449,
  "id": "babbage",
  "object": "model",
  "owned_by": "openai",
  "parent": null,
  "permission": [
    {
      "allow_create_engine": false,
      "allow_fine_tuning": false,
      "allow_logprobs": true,
      "allow_sampling": true,
      "allow_search_indices": false,
      "allow_view": true,
      "created": 1669085501,
      "group": null,
      "id": "modelperm-49FUp5v084tBB49tC4z8LPH5",
      "is_blocking": false,
      "object": "model_permission",
      "organization": "*"
    }
  ],
  "root": "babbage"
}

### text-davinci-003

In [41]:
# Model queried in this section, and a run timestamp used in the output file name.
MODEL_NAME = 'text-davinci-003'
timestamp_str = datetime.now().strftime('%Y-%m-%d_%H%M%S')

In [35]:
# Header that introduces the few-shot examples.
base_example_prompt = """
Transform text to code

# EXAMPLES:

"""
# Base prompt = API spec prompt, a newline, then the header plus the first 3 generated examples.
base_prompt = spec_prompt
base_prompt += '\n'
base_prompt += build_examples_prompt(base_example_prompt, examples_df, limit=3)

# Token count of the base prompt under the GPT-2 tokenizer (capped at 51200 by truncation).
len(tokenizer(base_prompt, max_length=51200, truncation=True)["input_ids"])

2417

In [37]:
# Import the submodule explicitly: a bare `import tqdm` does not guarantee that
# `tqdm.notebook` has been loaded, so `tqdm.notebook.tqdm` only works when some
# other import happened to pull it in first.
import tqdm.notebook


# Query the model once per evaluation row. Each prompt is the base prompt followed
# by the row's utterance and an open "code:" slot for the completion.
# NOTE(review): no temperature is passed, so the API default sampling applies and
# scores may vary between runs — consider temperature=0 for reproducibility.
responses = []
for i, row  in tqdm.notebook.tqdm(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt + '\n'
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/82 [00:00<?, ?it/s]

In [43]:
responses_df = pd.DataFrame(responses)
# file_name already ends with '.csv.gz'. Insert the timestamp before the extension
# so the saved archive keeps a recognisable suffix instead of ending in the timestamp.
responses_stem = file_name[:-len('.csv.gz')] if file_name.endswith('.csv.gz') else file_name
responses_df.to_csv(f'build/openai-{MODEL_NAME}-{responses_stem}-{timestamp_str}.csv.gz', index=False, compression='gzip')

In [None]:
# Preview the text of the first completion choice per response (None when there are no choices).
responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)

0     walmart_availability = Store.check_availabilit...
1     date_time = DateTime.resolve_from_text("tomorr...
2     date_time = DateTime.resolve_from_text("tomorr...
3     music_source = MusicSource.resolve_from_text("...
4     recipient = Recipient.resolve_from_text("Dad")...
                            ...                        
77    spotify_playlist_name = "lofi"\nMediaPlayer.pl...
78    date_time = DateTime.resolve_from_text("tonigh...
79    date_time = DateTime.resolve_from_text("tonigh...
80    date_time_start = DateTime.resolve_from_text("...
81    date_time_tomorrow = DateTime.resolve_from_tex...
Name: choices, Length: 82, dtype: object

#### Evaluating the results

In [44]:
# Columns available in the evaluation frame before the generated code is attached.
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep'],
      dtype='object')

In [45]:
# Use the first completion choice of each response as the generated code (None when there are no choices).
# These assignments add columns to eval_df in place.
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
# Embed the generated code into each row's test program, then execute it to collect the results.
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

# Average the row scores within each sample_id, then average across samples.
scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0390625

In [52]:
# Inspect the first scored rows.
scores_df.head()

Unnamed: 0_level_0,index,test_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,generated_code,test_code,results,correct,incorrect,total,code_failure,score
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"content = Content.resolve_from_text(""Pepsi"")\n...",from entities.generic import *\nfrom entities....,{'code_failure': 1},0,0,0,1,0.0
1,1,1_a,a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr...",from entities.generic import *\nfrom entities....,{'code_failure': 1},0,0,0,1,0.0
1,2,1_b,b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr...",from entities.generic import *\nfrom entities....,{'code_failure': 1},0,0,0,1,0.0
2,3,2,,Play the new Taylor Swift album and pull up my...,"album = Album.resolve_from_text(""the new Taylo...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Play ] ] ...,[ Module [ album = Album.resolve_from_text('th...,album = Content.resolve_from_text('the new Tay...,from entities.generic import *\nfrom entities....,{'code_failure': 1},0,0,0,1,0.0
3,4,3_a,a,Send a message to dad if it rains tomorrow.,"date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Body [...,[ Module [ date_time = DateTime.resolve_from_t...,"recipient = Contact.resolve_from_text(""dad"")\n...",from entities.generic import *\nfrom entities....,{'code_failure': 1},0,0,0,1,0.0


In [49]:
# Number of rows per code_failure value (1 = executing the test program raised a non-assertion exception).
scores_df['code_failure'].value_counts()

1    77
0     5
Name: code_failure, dtype: int64

In [51]:
# Number of rows per (correct, incorrect) combination.
scores_df[['correct', 'incorrect']].value_counts()

correct  incorrect
0        0            78
1        0             2
3        0             1
6        0             1
dtype: int64

In [237]:
# Same value as the earlier MODEL_NAME assignment; this run changes the prompt, not the model.
MODEL_NAME = 'text-davinci-003'

In [238]:
# `tqdm.tqdm_notebook` is a deprecated alias that emits a TqdmDeprecationWarning;
# use the notebook progress bar from `tqdm.notebook` directly.
from tqdm.notebook import tqdm as notebook_tqdm

# Header that introduces the few-shot examples.
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
# Examples-only base prompt: the header plus the first 13 generated examples (no API spec).
base_prompt = build_examples_prompt(examples_prompt, examples_df, limit=13)

# Query the model once per evaluation row. Each prompt is the base prompt followed
# by the row's utterance and an open "code:" slot for the completion.
responses = []
for i, row  in notebook_tqdm(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/82 [00:00<?, ?it/s]

In [191]:
responses_df = pd.DataFrame(responses)
# The notebook chdir's into the work area at the top, so outputs belong in ./build
# like every other artefact; '../build' would resolve outside the work area.
responses_df.to_csv(f'build/openai-{MODEL_NAME}-{file_name}', index=False, compression='gzip')

In [205]:
# Preview the text of the first completion choice per response (None when there are no choices).
responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)

0     walmart_availability = Store.check_availabilit...
1     date_time = DateTime.resolve_from_text("tomorr...
2     date_time = DateTime.resolve_from_text("tomorr...
3     music_source = MusicSource.resolve_from_text("...
4     recipient = Recipient.resolve_from_text("Dad")...
                            ...                        
77    spotify_playlist_name = "lofi"\nMediaPlayer.pl...
78    date_time = DateTime.resolve_from_text("tonigh...
79    date_time = DateTime.resolve_from_text("tonigh...
80    date_time_start = DateTime.resolve_from_text("...
81    date_time_tomorrow = DateTime.resolve_from_tex...
Name: choices, Length: 82, dtype: object

#### Evaluating the results

In [212]:
# Columns currently in the evaluation frame; the cell below assigns 'generated_code' again.
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'generated_code'],
      dtype='object')

In [219]:
# Use the first completion choice of each response as the generated code (None when there are no choices).
# These assignments write columns on eval_df in place.
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
# Embed the generated code into each row's test program, then execute it to collect the results.
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

# Average the row scores within each sample_id, then average across samples.
scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0625

### gpt-4

# 'gpt-4' is the id reported by openai.Model.list() above; 'text-gpt4' is not among the listed ids.
MODEL_NAME = 'gpt-4'
model = openai.Model(MODEL_NAME)