In [1]:
%load_ext dotenv
%dotenv

In [2]:
!pwd

/Users/asaf/Workspace/biu/complex-utterance-to-code/notebooks


In [1]:
import sys
import os 

WORK_AREA = '..'
os.chdir(WORK_AREA)

paths = ['./src/', './src/api/v6', './notebooks/src']
for path in paths:
    path = os.path.normcase(path)
    if not any(os.path.normcase(sp) == path for sp in sys.path):
        sys.path.append(path)

In [13]:
from typing import Union, List
import openai
import glob
import pandas as pd
import numpy as np
import tqdm
from transformers import GPT2TokenizerFast
import math
import tokenize
from nltk.translate import bleu_score
from datetime import datetime
import time
import json

In [3]:
!pip freeze | grep openai

openai @ file:///home/conda/feedstock_root/build_artifacts/openai_1686159246812/work


In [4]:
openai.organization = os.getenv("OPENAI_API_ORG")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [5]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [172]:
file_path = 'build/train_complex_utterance_to_code_with_intermediate_40k.csv.gz'
examples_df = pd.read_csv(file_path)
examples_df = examples_df.reset_index()  # make sure indexes pair with number of rows
examples_df.head(3)

Unnamed: 0,index,text,code,lang_rep,code_rep
0,0,see if find my first reminders that I have a m...,"person_reminded = Contact.resolve_from_text(""m...",[ root\n\t[ S\n\t\t[ Command\n\t\t\t[ Action\n...,\t[ Module\n\t\t[ person_reminded = Contact.re...
1,1,create a reminder at mindnight to close the wi...,"date_time = DateTime.resolve_from_text(""mindni...",[ root\n\t[ S\n\t\t[ Command\n\t\t\t[ Action\n...,\t[ Module\n\t\t[ date_time = DateTime.resolve...
2,2,show route to my office from Northern Mariana ...,"origin = Location.resolve_from_text(""from Nort...",[ root\n\t[ S\n\t\t[ Command\n\t\t\t[ Conditio...,\t[ Module\n\t\t[ origin = Location.resolve_fr...


## Building a prompt

In [161]:
strategies = {
    "text2code": {
        "system_prompt": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into Python code.",
        "user_prompt_examples": "Below are examples of text descriptions and their corresponding Python code implementations.",
        "user_prompt_spec": "Below are API specifications for implementing the description in code.",
        "prediction_prompt": "convert the following text description into Python code:\n",
        "input_column": "text",
        "input_label": "Text",
        "output_column": "code",
        "output_label": "Code",
    },
    "text2rep": {
        "system_prompt": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into a dense variation of an Abstract Syntax Tree (AST) suitable for Python code.",
        "user_prompt_examples": "Below are examples of text descriptions and their corresponding dense ASTs for Python code.",
        "user_prompt_spec": "Below are API specifications for implementing the corresponding dense AST for Python code.",
        "prediction_prompt": "convert the following text description into Python densed AST form:\n",
        "input_column": "text",
        "input_label": "Text",
        "output_column": "code_rep",
        "output_label": "Dense AST",
    },
    "rep2code": {
        "system_prompt": "You are a skilled programmer. You will be provided with a variation of a Universal Dependencies (UD) tree and your task is to convert it into Python code.",
        "user_prompt_examples": "Below are examples of variations of a Universal Dependencies (UD) trees and their corresponding Python code implementations.",
        "user_prompt_spec": "Below are the API specifications for implementing the description in code.",
        "prediction_prompt": "convert the following variation of UD tree representation into Python code:\n",
        "input_column": "lang_rep",
        "input_label": "UD Tree variation",
        "output_column": "code",
        "output_label": "Code",
    },
    "rep2rep": {
        "system_prompt": "You are a skilled programmer. You will be provided with a variation of a Universal Dependencies (UD) tree and your task is to convert it into a dense variation of an Abstract Syntax Tree (AST) suitable for Python code.",
        "user_prompt_examples": "Below are examples of variations of a Universal Dependencies (UD) trees and their corresponding dense ASTs for Python code.\n",
        "user_prompt_spec": "Below are API specifications for implementing the corresponding dense AST for Python code.",
        "prediction_prompt": "convert the following variation of UD tree representation into dense AST for Python code:\n",
        "input_column": "lang_rep",
        "input_label": "UD Tree variation",
        "output_column": "code_rep",
        "output_label": "Dense AST",
    },
}

In [192]:
def build_example_prompt(input_label, input_value, output_label, output_value=None, base_prompt=None):
    input_prompt = f"{base_prompt or ''}{input_label}: {input_value}"
    output_prompt = f"{output_label.capitalize()}:\n{output_value}" if output_value else f"code:\n"
    example_prompt = [
        {
            "role": "user",
            "content": input_prompt
        },
        {
            "role": "assistant",
            "content": output_prompt
        }
    ]
    return example_prompt


def build_examples_prompt(
    df: pd.DataFrame, 
    input_data: object = None, 
    strategy: str="text2code", 
    headless: bool=False, 
    limit: int=10
):    
    if strategy not in strategies:
        raise ValueError(f"Strategy {strategy} not found in {strategies.keys()}")
    
    properties = strategies[strategy]
    
    example_prompts = []
    for index, row in df[:limit].iterrows():
        example_prompt = build_example_prompt(
            input_value=row[properties["input_column"]], 
            input_label=properties["input_label"],
            output_value=row[properties["output_column"]],
            output_label=properties["output_label"],
            base_prompt=properties["user_prompt_examples"] if index == 0 else "",
        )
        example_prompts += example_prompt

    if headless:
        prompt = example_prompts
    else:
        prompt = [{
            "role": "system", 
            "content": properties["system_prompt"]
        }] 
        
        prompt += example_prompts
        
        if input_data is not None:
            input_prompt = build_example_prompt(
                input_value=input_data[properties["input_column"]], 
                input_label=properties["input_label"], 
                output_label=properties["output_label"]
            )
            prompt += [{
                "role": "user",
                "content": "Based on the previous examples, " + properties["prediction_prompt"] + input_prompt[0]["content"]
            }]
    
    return prompt


def build_spec_prompt(
    path: str = './config/prompts/**/*.txt', 
    input_data: object = None, 
    strategy: str= "text2code", 
    examples_df: pd.DataFrame = None, 
    examples_limit: int = 0,
    headless: bool=False, 
):
    if strategy not in strategies:
        raise ValueError(f"Strategy {strategy} not found in {strategies.keys()}")
    
    properties = strategies[strategy]
    
    prompt_dict = {}
    for prompt_file in glob.glob(path):
        key = os.path.basename(prompt_file).split('.')[0].lower()
        with open(prompt_file, "r") as f:
            prompt_dict[key] = f.read()
    
    spec = ""
    for key, value in prompt_dict.items():
        spec += f"# {key.upper()}:\n\n{value}\n\n"
    prompt = properties["user_prompt_spec"] + "\n\n" + spec
    
    spec_prompt = [{
        "role": "user",
        "content": prompt
    },
    {
        "role": "assistant",
        "content": "ok"
    }]
    
    if examples_limit > 0 and examples_df is not None:
        spec_prompt
        spec_prompt += build_examples_prompt(examples_df, strategy=strategy, limit=examples_limit, headless=True)
            
    if headless:
        prompt = spec_prompt
    else:
        prompt = [{
            "role": "system", 
            "content": properties["system_prompt"]
        }] 
        
        prompt += spec_prompt
        
        if input_data is not None:
            input_prompt = build_example_prompt(
                input_value=input_data[properties["input_column"]], 
                input_label=properties["input_label"], 
                output_label=properties["output_label"]
            )
            prompt += [{
                "role": "user",
                "content": "Based on the API spec and previous examples, " + properties["prediction_prompt"] + input_prompt
            }]
    
    
    return prompt


def build_prompt(
    df: pd.DataFrame,
    prompt_type: str = 'examples',
    strategy: str = 'text2code',
    input_data: object = None, 
    examples_df: pd.DataFrame = None, 
    examples_limit: int = 30, 
    chat_format: bool = True
):
    if prompt_type == 'examples':
        prompt = build_examples_prompt(df=df, input_data=input_data, strategy=strategy, limit=examples_limit)
    elif prompt_type == 'apispec':
        prompt = build_spec_prompt(examples_df=df, strategy=strategy, examples_limit=examples_limit)
    
    if not chat_format:
        raise NotImplementedError("Not implemented yet")
    
    return prompt

In [193]:
prompt = build_example_prompt(input_value=examples_df['text'][0], input_label="Text", output_value=examples_df['code'][0], output_label="Code")
print(json.dumps(prompt, indent=2))


[
  {
    "role": "user",
    "content": "Text: see if find my first reminders that I have a meeting at 3pm and there are and see if I got a reminder at mindnight in 2 days to bring the keys"
  },
  {
    "role": "assistant",
    "content": "Code:\nperson_reminded = Contact.resolve_from_text(\"my\")\ncontent = Content.resolve_from_text(\"I have a meeting at 3pm\")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, content=content)\nreminders = first(reminders)\nResponder.respond(response=reminders)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)\n\nperson_reminded = Contact.resolve_from_text(\"I\")\ndate_time = DateTime.resolve_from_text(\"mindnight in 2 days\")\ncontent = Content.resolve_from_text(\"bring the keys\")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, date_time=date_time, content=content)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)"
  }
]


In [196]:
input_data = examples_df.iloc[0]
prompt = build_prompt(df=examples_df, prompt_type='examples', strategy="rep2rep", input_data=input_data, examples_df=examples_df, examples_limit=2)
prompt_str = "".join([p['content'] for p in prompt if p['role'] == 'user'])

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
base_prompt_tokens_len = len(tokenizer(prompt_str, max_length=51200, truncation=True)["input_ids"])
print(f"Base prompt tokens length: {base_prompt_tokens_len}")

Base prompt tokens length: 5885


In [197]:
prompt = build_prompt(df=examples_df, prompt_type='apispec', strategy="rep2rep", examples_df=examples_df, examples_limit=2)
prompt_str = "".join([p['content'] for p in prompt if p['role'] == 'user'])

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
base_prompt_tokens_len = len(tokenizer(prompt_str, max_length=51200, truncation=True)["input_ids"])
print(f"Base prompt tokens length: {base_prompt_tokens_len}")
print(json.dumps(prompt, indent=2))

Base prompt tokens length: 18665
[
  {
    "role": "system",
    "content": "You are a skilled programmer. You will be provided with a variation of a Universal Dependencies (UD) tree and your task is to convert it into a dense variation of an Abstract Syntax Tree (AST) suitable for Python code."
  },
  {
    "role": "user",
    "content": "Below are API specifications for implementing the corresponding dense AST for Python code.\n\n# MAP:\n\nclass Map(Action):\n    \"\"\"\n    The Map class contains all the methods of a virtual assistant agent in the map domain.\n    \"\"\"\n\n    @classmethod\n    def find_on_map(cls, location: Location) -> List[MapEntity]:\n        \"\"\"\n        This class method finds places on the map.\n\n        Parameters\n        ----------\n        location : Location\n            The location to search for\n\n        Returns\n        -------\n        List[MapEntity]\n            A list of places in the form of map entities\n        \"\"\"\n        pass\n\n# 

## OpenAI Predictions

### List available models

In [103]:
oai_models = openai.Model.list()
print([model_data['id'] for model_data in oai_models['data']])

['text-search-babbage-doc-001', 'curie-search-query', 'text-davinci-003', 'text-search-babbage-query-001', 'babbage', 'babbage-search-query', 'text-babbage-001', 'text-similarity-davinci-001', 'davinci-similarity', 'code-davinci-edit-001', 'curie-similarity', 'babbage-search-document', 'curie-instruct-beta', 'text-search-ada-doc-001', 'davinci-instruct-beta', 'whisper-1', 'text-similarity-babbage-001', 'text-search-davinci-doc-001', 'babbage-similarity', 'text-embedding-ada-002', 'davinci-search-query', 'text-similarity-curie-001', 'text-davinci-001', 'text-search-davinci-query-001', 'ada-search-document', 'ada-code-search-code', 'babbage-002', 'davinci-002', 'davinci-search-document', 'curie-search-document', 'gpt-4-0613', 'babbage-code-search-code', 'text-search-ada-query-001', 'code-search-ada-text-001', 'babbage-code-search-text', 'gpt-4-vision-preview', 'code-search-babbage-code-001', 'ada-search-query', 'gpt-3.5-turbo', 'ada-code-search-text', 'tts-1-hd', 'text-search-curie-query

In [104]:
print([model_data['id'] for model_data in oai_models['data'] if 'code' in model_data['id']])

['code-davinci-edit-001', 'ada-code-search-code', 'babbage-code-search-code', 'code-search-ada-text-001', 'babbage-code-search-text', 'code-search-babbage-code-001', 'ada-code-search-text', 'code-search-babbage-text-001', 'code-search-ada-code-001']


In [107]:
print([model_data['id'] for model_data in oai_models['data'] if 'turbo' in model_data['id']])

['gpt-3.5-turbo', 'gpt-3.5-turbo-16k', 'gpt-3.5-turbo-0301', 'gpt-3.5-turbo-16k-0613', 'gpt-3.5-turbo-1106', 'gpt-3.5-turbo-instruct-0914', 'gpt-3.5-turbo-instruct', 'gpt-3.5-turbo-0613']


### text-davinci-003

In [18]:
MODEL_NAME = 'text-davinci-003'

In [23]:
print(build_examples_prompt(examples_prompt, examples_df[:1], limit=15))


Transform text to code

# EXAMPLES:

text: 
see if find my first reminders that I have a meeting at 3pm and there are and see if I got a reminder at mindnight in 2 days to bring the keys

code: 
person_reminded = Contact.resolve_from_text("my")
content = Content.resolve_from_text("I have a meeting at 3pm")
reminders = Reminders.find_reminders(person_reminded=person_reminded, content=content)
reminders = first(reminders)
Responder.respond(response=reminders)
test_reminders = bool(reminders)
Responder.respond(response=test_reminders)

person_reminded = Contact.resolve_from_text("I")
date_time = DateTime.resolve_from_text("mindnight in 2 days")
content = Content.resolve_from_text("bring the keys")
reminders = Reminders.find_reminders(person_reminded=person_reminded, date_time=date_time, content=content)
test_reminders = bool(reminders)
Responder.respond(response=test_reminders)





In [19]:
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
base_prompt = build_examples_prompt(examples_prompt, examples_df[:1], limit=15)

responses = []
for i, row  in tqdm_notebook(eval_df[:1].iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, row  in tqdm_notebook(eval_df[:1].iterrows(), total=eval_df.shape[0], desc="Processing records"):


Processing records:   0%|          | 0/152 [00:00<?, ?it/s]

InvalidRequestError: This is a chat model and not supported in the v1/completions endpoint. Did you mean to use v1/chat/completions?

In [314]:
date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
responses_file_path = f'./build/openai-{MODEL_NAME}-{date_str}-{file_name}'
responses_file_path

'./build/openai-text-davinci-003-eval_complex_utterance_to_code_with_intermediate_82_20230519.csv.gz'

In [334]:
responses_data = [response['choices'][0]['text'] for response in responses]
eval_oai_df = eval_df.copy()
eval_oai_df['output'] = pd.Series(responses_data)
eval_oai_df.head()

Unnamed: 0,index,test_id,sample_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,output
0,0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"product_name = ProductName.resolve_from_text(""..."
1,1,1_a,1,a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
2,2,1_b,1,b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
3,3,2,2,,Play the new Taylor Swift album and pull up my...,"album = Album.resolve_from_text(""the new Taylo...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Play ] ] ...,[ Module [ album = Album.resolve_from_text('th...,"album_name = AlbumName.resolve_from_text(""the ..."
4,4,3_a,3,a,Send a message to dad if it rains tomorrow.,"date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Body [...,[ Module [ date_time = DateTime.resolve_from_t...,"destination = Contact.resolve_from_text(""dad"")..."


In [323]:

eval_oai_df.to_csv(responses_file_path, index=False, compression='gzip')

In [324]:
eval_oai_df = pd.read_csv(responses_file_path)
eval_oai_df.head()

Unnamed: 0,index,test_id,sample_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,output
0,0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"product_name = ProductName.resolve_from_text(""..."
1,1,1_a,1,a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
2,2,1_b,1,b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
3,3,2,2,,Play the new Taylor Swift album and pull up my...,"album = Album.resolve_from_text(""the new Taylo...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Play ] ] ...,[ Module [ album = Album.resolve_from_text('th...,"album_name = AlbumName.resolve_from_text(""the ..."
4,4,3_a,3,a,Send a message to dad if it rains tomorrow.,"date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Body [...,[ Module [ date_time = DateTime.resolve_from_t...,"destination = Contact.resolve_from_text(""dad"")..."


In [347]:
model_eval(
    responses_file_path,
    compute_humanval=True, 
    compute_bleu=True
)

{'humaneval': {'score': 0.08333333333333333,
  'results':                            code_failure  correct  incorrect  accuracy
  sample_id sample_minor_id                                            
  0         NaN                         0        4          0       1.0
  1         a                           1        0          0       0.0
            b                           1        0          0       0.0
  2         NaN                         1        0          0       0.0
  3         a                           1        0          0       0.0
  ...                                 ...      ...        ...       ...
  104       b                           0        1          0       1.0
  105       NaN                         1        0          0       0.0
  108       NaN                         1        0          0       0.0
  109       a                           1        0          0       0.0
            b                           1        0          0       0.0
  
  [92

In [339]:
results_file_path = responses_file_path
parse_to_code = False
output_column = 'output'

results_df = pd.read_csv(results_file_path, compression='gzip')

results_df['sample_id'] = results_df['sample_id'].astype(int)
results_df.set_index(['sample_id', 'sample_minor_id'], inplace=True)
results_df.sort_index(inplace=True)

code_column = 'generated_code'
results_df[code_column] = results_df[output_column]
    
results_df['test'] = results_df['test'].str.replace("= next(iterator)", "= next(iterator, None)")
results_df[code_column] = results_df[code_column].str.replace(" = ContentType.", " = MessageContentType.")
results_df[code_column] = results_df[code_column].str.replace("Message.", "Messages.")

In [346]:
data = results_df
gold_column = 'code'
generated_column = 'generated_code'
score_id_labels = 'sample_id'
score_column_name: str = 'bleu_score'

eval_results = data.apply(lambda x: eval_bleu(x[gold_column], x[generated_column]), axis=1)
eval_results_df = eval_results.to_frame('bleu_score')
score = eval_results_df.reset_index(drop=False).groupby(score_id_labels)[score_column_name].mean().mean()
dict(score=score, results=eval_results_df)

{'score': 0.4386397592741266,
 'results':                            bleu_score
 sample_id sample_minor_id            
 0         NaN                0.641628
 1         a                  0.535784
           b                  0.521357
 2         NaN                0.437082
 3         a                  0.413056
 ...                               ...
 104       b                  0.679365
 105       NaN                0.299206
 108       NaN                0.365651
 109       a                  0.310716
           b                  0.291676
 
 [92 rows x 1 columns]}

In [335]:
eval_oai_df.set_index(['sample_id', 'sample_minor_id'], inplace=True)

In [348]:
print(eval_oai_df['output'].loc[(1, 'a')])

date_time = DateTime.resolve_from_text("tomorrow morning")
weather_forecasts = Weather.find_weather_forecasts(date_time=date_time)
test_rain = any((weather_forecast.weather_type for weather_forecast in weather_forecasts) == Rain)
te_est_weather_forecasts = bool(weather_forecasts)
Responder.respond(response=test_weather_forecasts)
if test_weather_forecasts and test_rain:
  date_time = DateTime.resolve_from_text("7:30")
  Alarm.create_alarm(date_time=date_time)
else:
  date_time = DateTime.resolve_from_text("8")
  Alarm.create_alarm(date_time=date_time)


In [None]:
date_time = DateTime.resolve_from_text("tomorrow morning")
weather_forecasts = Weather.find_weather_forecasts(date_time=date_time)
test_rain = any((weather_forecast.weather_type for weather_forecast in weather_forecasts) == Rain)
te_est_weather_forecasts = bool(weather_forecasts)
Responder.respond(response=test_weather_forecasts)
if test_weather_forecasts and test_rain:
  date_time = DateTime.resolve_from_text("7:30")
  Alarm.create_alarm(date_time=date_time)
else:
  date_time = DateTime.resolve_from_text("8")
  Alarm.create_alarm(date_time=date_time)

In [337]:
print(eval_oai_df['output'].loc[(105, None)])

event_name = EventName.resolve_from_text("the art festival")
date_time = DateTime.resolve_from_text("this weekend")
events = Calendar.find_events(event_name=event_name, date_time=date_time)
Tickets.purchase_tickets(events=events)

address = Address.resolve_from_text("the address")
Navigation.add_address_to_navigation(address=address)


In [None]:
event_name = EventName.resolve_from_text("the art festival")
date_time = DateTime.resolve_from_text("this weekend")
events = Calendar.find_events(event_name=event_name, date_time=date_time)
Tickets.purchase_tickets(events=events)

address = Address.resolve_from_text("the address")
Navigation.add_address_to_navigation(address=address)

In [343]:
print(eval_oai_df['output'].loc[(55, None)])

person_reminded = Contact.resolve_from_text("me")
date_time = DateTime.resolve_from_text("tomorrow")
contacts = Contact.resolve_many_from_text("Mom and Dad")
content = Content.resolve_from_text("send an email to contacts")
Reminders.create_reminder(person_reminded=person_reminded, date_time=date_


#### Evaluating the results

In [None]:
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'generated_code'],
      dtype='object')

In [None]:
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0625

In [237]:
MODEL_NAME = 'text-davinci-003'

In [238]:
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
base_prompt = build_examples_prompt(examples_prompt, examples_df, limit=13)

responses = []
for i, row  in tqdm_notebook(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/82 [00:00<?, ?it/s]

In [191]:
responses_df = pd.DataFrame(responses)
responses_df.to_csv(f'../build/openai-{MODEL_NAME}-{file_name}', index=False, compression='gzip')

In [205]:
responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)

0     walmart_availability = Store.check_availabilit...
1     date_time = DateTime.resolve_from_text("tomorr...
2     date_time = DateTime.resolve_from_text("tomorr...
3     music_source = MusicSource.resolve_from_text("...
4     recipient = Recipient.resolve_from_text("Dad")...
                            ...                        
77    spotify_playlist_name = "lofi"\nMediaPlayer.pl...
78    date_time = DateTime.resolve_from_text("tonigh...
79    date_time = DateTime.resolve_from_text("tonigh...
80    date_time_start = DateTime.resolve_from_text("...
81    date_time_tomorrow = DateTime.resolve_from_tex...
Name: choices, Length: 82, dtype: object

#### Evaluating the results

In [212]:
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'generated_code'],
      dtype='object')

In [219]:
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0625

### gpt-3.5-turbo

In [349]:
MODEL_NAME = 'gpt-3.5-turbo'

In [591]:
k = 1
wait_time_in_seconds = 1
id_labels = ['test_id', 'sample_id', 'sample_minor_id']
prompt_type = 'examples' # 'examples' or 'apispec'

In [592]:
if prompt_type == 'apispec':
    prompt = build_spec_prompt()
    text, code = examples_df.iloc[0][['text', 'code']]
    base_messages = [
        {"role": "user", "content": f"This is the API specification:\n{prompt}"},
        {"role": "assistant", "content": "OK"},
        {"role": "user", "content": f"Transform the following English text to code:\n{text}"},
        {"role": "assistant", "content": code},
    ]
else:
    base_messages = build_examples_prompt(examples_prompt, examples_df, limit=70, chat_format=True)

In [593]:
responses_file_path = f'./build/openai-{MODEL_NAME}-prompt-{prompt_type}-k{k}-{file_name}'
print(responses_file_path)

./build/openai-gpt-3.5-turbo-16k-prompt-examples-k1-eval_complex_utterance_to_code_with_intermediate_152_20230525.csv.gz


In [598]:
eval_df.sort_index(inplace=True)
eval_oai_df = pd.read_csv(responses_file_path, compression='gzip') if os.path.exists(responses_file_path) else eval_df.copy()
eval_oai_df.set_index(id_labels, inplace=True)
eval_oai_df.sort_index(inplace=True)

responses = []
force = False
for i, row  in tqdm.notebook.tqdm(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    index = tuple(row[id_label] for id_label in id_labels)
    if not force and 'output' in eval_oai_df.loc[index] and not pd.isnull(eval_oai_df.loc[index]['output']):
        time.sleep(0.01)
    else:
        response = openai.ChatCompletion.create(
            model=MODEL_NAME, 
            messages= [{"role": "system", "content": "You are a code programmer."}] +
                base_messages + 
                [{"role": "user", "content": f"Transform the following English text to code:\n{row['text']}"}],
            max_tokens=1000,
            n=k
        )
        responses.append(response)
        
        outputs = [x['message']['content'] for x in response['choices']]
        outputs = [output.replace("code:\n", "").strip() for output in outputs]
        ks = list(np.arange(k))
        
        eval_oai_df['output'] = eval_oai_df['output'].astype(object) if 'output' in eval_oai_df.columns else None
        eval_oai_df.loc[:, 'output'].loc[index] = [outputs]
        eval_oai_df['k'] = eval_oai_df['k'].astype(object) if 'k' in eval_oai_df.columns else None
        eval_oai_df.loc[:, 'k'].loc[index] = [ks]
        eval_oai_df = eval_oai_df.explode(['output', 'k'])
        
        eval_oai_df.to_csv(responses_file_path, index=True, compression='gzip')
        
        time.sleep(wait_time_in_seconds)

Processing records:   0%|          | 0/152 [00:00<?, ?it/s]

### gpt-4

#### Examples prompt

In [198]:
MODEL_NAME = 'gpt-4-1106-preview'

In [199]:
n = 100
id_labels = ['sample_id'] #['test_id', 'sample_id', 'sample_minor_id']
prompt_type = 'examples' # 'examples' or 'apispec'
model_id = 'openai_' + MODEL_NAME.replace('-', '_')
strategy = 'rep2rep'
examples_limit = 18

In [200]:
test_df = load_eval_data()
print(test_df.shape)
test_df.head(3)

NameError: name 'load_eval_data' is not defined

In [736]:
results_file_path = f"./build/results/test-{str(test_df.shape[0])}-{model_id}-{strategy}-n{n}-{prompt_type}-ex{examples_limit}.csv.gz"
print(results_file_path)

./build/results/test-152-openai_gpt_4-text2code-n100-examples-ex18.csv.gz


In [737]:
# loading the results file
test_results_df = pd.read_csv(results_file_path, compression='gzip') if os.path.exists(results_file_path) else test_df.copy()
test_results_df.set_index(id_labels, inplace=True)
test_results_df.sort_index(inplace=True)
test_results_df.head(3)

Unnamed: 0_level_0,index,test_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,output,n
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"locations = Location.resolve_many_from_text(""W...",0.0
0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"product_name = ProductName.resolve_from_text(""...",28.0
0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"product_name = ProductName.resolve_from_text(""...",29.0


In [738]:
chat_format = True
prompt = build_prompt(prompt_type=prompt_type, examples_limit=examples_limit, chat_format=chat_format)
prompt_str = "\n".join([p['content'] for p in prompt if p['role'] == 'user'])
print(f"prompt_type: {prompt_type}\nprompt: {base_messages[:1000]}")

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
base_prompt_tokens_len = len(tokenizer(prompt_str, max_length=51200, truncation=True)["input_ids"])
print(f"Base prompt tokens length: {base_prompt_tokens_len}")

prompt_type: examples
prompt: [{'role': 'user', 'content': [{'role': 'user', 'content': 'Text: see if find my first reminders that I have a meeting at 3pm and there are and see if I got a reminder at mindnight in 2 days to bring the keys'}, {'role': 'assistant', 'content': 'Code:\nperson_reminded = Contact.resolve_from_text("my")\ncontent = Content.resolve_from_text("I have a meeting at 3pm")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, content=content)\nreminders = first(reminders)\nResponder.respond(response=reminders)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)\n\nperson_reminded = Contact.resolve_from_text("I")\ndate_time = DateTime.resolve_from_text("mindnight in 2 days")\ncontent = Content.resolve_from_text("bring the keys")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, date_time=date_time, content=content)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)'}, {'role': 

In [742]:
openai_disabled = False
force = False
total_records = test_df.shape[0]
max_tokens = 500
wait_time_in_seconds = 20

# generate predictions
responses = []
print(f"Generating predictions for {total_records} records")
for i, row  in tqdm.notebook.tqdm(test_df.iterrows(), total=total_records, desc="Processing records"):   
    # check to see if we already have a result for this record
    index = tuple(row[id_label] for id_label in id_labels) if len(id_labels) > 1 else row[id_labels[0]]
    index = [index]
    if (not force) and ('output' in test_results_df.loc[index]) and (not any(pd.isnull(test_results_df.loc[index, 'output']))):
        # if we do, then skip this record
        time.sleep(0.1)
    elif not openai_disabled:
        # run the model, if we don't have a result
        messages = [{
            "role": "system", 
            "content": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into Python code. Below are examples of text descriptions and their corresponding Python code implementations."
            # "content": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into Python code. Below is an API spec for implementing the description in code."
        }] + prompt + [{
            "role": "user",
            "content": "Based on the previous examples, convert the following text into Python code:\n" + build_example_prompt(text=row["text"])
            # "content": "Based on the API spec, convert the following text into Python code:\n" + build_example_prompt(text=row["text"])
        }]
        response = openai.ChatCompletion.create(
            model=MODEL_NAME, 
            messages=messages,
            max_tokens=max_tokens,
            n=n
        )
        responses.append(response)
        
        outputs = [x['message']['content'] for x in response['choices']]
        outputs = [output.replace("Code:\n", "").replace("```python", "").replace("```py", "").replace("```", "").strip() for output in outputs]
        ns = list(np.arange(n))
        
        # duplicate the records
        records_to_duplicate = test_results_df.loc[index] # Fetch the records
        duplicated_records = pd.concat([records_to_duplicate] * (n - 1), ignore_index=False) # Duplicate the records
        test_results_df = pd.concat([test_results_df, duplicated_records], ignore_index=False) # Append the duplicated records back to the original DataFrame (optional)
        
        # set values for output and n
        test_results_df['output'] = test_results_df['output'].astype(object) if 'output' in test_results_df.columns else None
        test_results_df.loc[index, 'output'] = outputs * len(records_to_duplicate)
        test_results_df['n'] = test_results_df['n'].astype(object) if 'n' in test_results_df.columns else None
        test_results_df.loc[index, 'n'] = ns * len(records_to_duplicate)
        
        test_results_df.to_csv(results_file_path, index=True, compression='gzip')
        
        time.sleep(wait_time_in_seconds)

Generating predictions for 152 records


Processing records:   0%|          | 0/152 [00:00<?, ?it/s]

#### Spec prompt

In [699]:
MODEL_NAME = 'gpt-4-1106-preview'

n = 100
id_labels = ['sample_id'] #['test_id', 'sample_id', 'sample_minor_id']
prompt_type = 'apispec' # 'examples' or 'apispec'
strategy = 'rep2rep'
model_id = 'openai_' + MODEL_NAME.replace('-', '_')
slug = 'text2code'

test_df = load_eval_data()

results_file_path = f"./build/results/test-{strategy}={str(test_df.shape[0])}-{model_id}-{slug}-n{n}-{prompt_type}-examples2.csv.gz"
test_results_df = pd.read_csv(results_file_path, compression='gzip') if os.path.exists(results_file_path) else test_df.copy()
test_results_df.set_index(id_labels, inplace=True)
test_results_df.sort_index(inplace=True)

In [108]:
prompt = build_prompt(prompt_type=prompt_type, strategy="rep2rep", examples_limit=2, chat_format=chat_format)
prompt

NameError: name 'prompt_type' is not defined

In [701]:
chat_format = True
prompt = build_prompt(prompt_type=prompt_type, examples_limit=42, chat_format=chat_format)
print("prompt", prompt)
prompt_str = "\n".join([p['content'] for p in prompt if p['role'] == 'user'])
print("promtp_str", prompt_str[:100])

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
base_prompt_tokens_len = len(tokenizer(prompt_str, max_length=51200, truncation=True)["input_ids"])
print(f"Base prompt tokens length: {base_prompt_tokens_len}")

prompt [{'role': 'user', 'content': 'Text: see if find my first reminders that I have a meeting at 3pm and there are and see if I got a reminder at mindnight in 2 days to bring the keys'}, {'role': 'assistant', 'content': 'Code:\nperson_reminded = Contact.resolve_from_text("my")\ncontent = Content.resolve_from_text("I have a meeting at 3pm")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, content=content)\nreminders = first(reminders)\nResponder.respond(response=reminders)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)\n\nperson_reminded = Contact.resolve_from_text("I")\ndate_time = DateTime.resolve_from_text("mindnight in 2 days")\ncontent = Content.resolve_from_text("bring the keys")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, date_time=date_time, content=content)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)'}, {'role': 'user', 'content': 'Text: create a reminder at mindn

In [685]:
openai_disabled = False
force = False
total_records = test_df.shape[0]
max_tokens = 500
wait_time_in_seconds = 20

# generate predictions
responses = []
print(f"Generating predictions for {total_records} records")
for i, row  in tqdm.notebook.tqdm(test_df.iterrows(), total=total_records, desc="Processing records"):   
    # check to see if we already have a result for this record
    index = tuple(row[id_label] for id_label in id_labels) if len(id_labels) > 1 else row[id_labels[0]]
    index = [index]
    if (not force) and ('output' in test_results_df.loc[index]) and (not any(pd.isnull(test_results_df.loc[index, 'output']))):
        # if we do, then skip this record
        time.sleep(0.1)
    elif not openai_disabled:
        # run the model, if we don't have a result
        messages = build_prompt(prompt_type=prompt_type, strategy=strategy, examples_limit=42, chat_format=chat_format)
        response = openai.ChatCompletion.create(
            model=MODEL_NAME, 
            messages=messages,
            max_tokens=max_tokens,
            n=n
        )
        responses.append(response)
        
        outputs = [x['message']['content'] for x in response['choices']]
        outputs = [output.replace("Code:\n", "").replace("```python", "").replace("```py", "").replace("```", "").strip() for output in outputs]
        ns = list(np.arange(n))
        
        # duplicate the records
        records_to_duplicate = test_results_df.loc[index] # Fetch the records
        duplicated_records = pd.concat([records_to_duplicate] * (n - 1), ignore_index=False) # Duplicate the records
        test_results_df = pd.concat([test_results_df, duplicated_records], ignore_index=False) # Append the duplicated records back to the original DataFrame (optional)
        
        # set values for output and n
        test_results_df['output'] = test_results_df['output'].astype(object) if 'output' in test_results_df.columns else None
        test_results_df.loc[index, 'output'] = outputs * len(records_to_duplicate)
        test_results_df['n'] = test_results_df['n'].astype(object) if 'n' in test_results_df.columns else None
        test_results_df.loc[index, 'n'] = ns * len(records_to_duplicate)
        
        test_results_df.to_csv(results_file_path, index=True, compression='gzip')
        
        time.sleep(wait_time_in_seconds)

Generating predictions for 152 records


Processing records:   0%|          | 0/152 [00:00<?, ?it/s]

In [676]:
results_file_path

'./build/results/test-152-openai_gpt_4_1106_preview-text2code-n100-apispec.csv.gz'

### gpt-3.5-turbo

#### Examples prompt

In [686]:
MODEL_NAME = 'gpt-3.5-turbo-1106'

In [687]:
n = 100
id_labels = ['sample_id'] #['test_id', 'sample_id', 'sample_minor_id']
prompt_type = 'examples' # 'examples' or 'apispec'
model_id = 'openai_' + MODEL_NAME.replace('-', '_')
slug = 'text2code'

In [688]:
test_df = load_eval_data()
print(test_df.shape)
test_df.head(3)

(152, 10)


Unnamed: 0,index,test_id,sample_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep
0,0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...
1,1,1_a,1,a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...
2,2,1_b,1,b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...


In [689]:
results_file_path = f"./build/results/test-{str(test_df.shape[0])}-{model_id}-{slug}-n{n}-{prompt_type}.csv.gz"
print(results_file_path)

./build/results/test-152-openai_gpt_3.5_turbo_1106-text2code-n100-examples.csv.gz


In [690]:
# loading the results file
test_results_df = pd.read_csv(results_file_path, compression='gzip') if os.path.exists(results_file_path) else test_df.copy()
test_results_df.set_index(id_labels, inplace=True)
test_results_df.sort_index(inplace=True)
test_results_df.head(3)

Unnamed: 0_level_0,index,test_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...
1,1,1_a,a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...
1,2,1_b,b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...


In [697]:
chat_format = True
# prompt = build_prompt(prompt_type=prompt_type, examples_limit=18, chat_format=chat_format)
prompt = build_prompt(prompt_type=prompt_type, examples_limit=90, chat_format=chat_format)
prompt_str = "\n".join([p['content'] for p in prompt if p['role'] == 'user'])
print(f"prompt_type: {prompt_type}\nprompt: {base_messages[:1000]}")

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
base_prompt_tokens_len = len(tokenizer(prompt_str, max_length=51200, truncation=True)["input_ids"])
print(f"Base prompt tokens length: {base_prompt_tokens_len}")

prompt_type: examples
prompt: [{'role': 'user', 'content': [{'role': 'user', 'content': 'Text: see if find my first reminders that I have a meeting at 3pm and there are and see if I got a reminder at mindnight in 2 days to bring the keys'}, {'role': 'assistant', 'content': 'Code:\nperson_reminded = Contact.resolve_from_text("my")\ncontent = Content.resolve_from_text("I have a meeting at 3pm")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, content=content)\nreminders = first(reminders)\nResponder.respond(response=reminders)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)\n\nperson_reminded = Contact.resolve_from_text("I")\ndate_time = DateTime.resolve_from_text("mindnight in 2 days")\ncontent = Content.resolve_from_text("bring the keys")\nreminders = Reminders.find_reminders(person_reminded=person_reminded, date_time=date_time, content=content)\ntest_reminders = bool(reminders)\nResponder.respond(response=test_reminders)'}, {'role': 

In [698]:
openai_disabled = False
force = False
total_records = test_df.shape[0]
max_tokens = 500
wait_time_in_seconds = 20

# generate predictions
responses = []
print(f"Generating predictions for {total_records} records")
for i, row  in tqdm.notebook.tqdm(test_df.iterrows(), total=total_records, desc="Processing records"):   
    # check to see if we already have a result for this record
    index = tuple(row[id_label] for id_label in id_labels) if len(id_labels) > 1 else row[id_labels[0]]
    index = [index]
    if (not force) and ('output' in test_results_df.loc[index]) and (not any(pd.isnull(test_results_df.loc[index, 'output']))):
        # if we do, then skip this record
        time.sleep(0.1)
    elif not openai_disabled:
        # run the model, if we don't have a result
        messages = [{
            "role": "system", 
            "content": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into Python code. Below are examples of text descriptions and their corresponding Python code implementations."
            # "content": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into Python code. Below is an API spec for implementing the description in code."
        }] + prompt + [{
            "role": "user",
            "content": "Based on the previous examples, convert the following text into Python code:\n" + build_example_prompt(text=row["text"])
            # "content": "Based on the API spec, convert the following text into Python code:\n" + build_example_prompt(text=row["text"])
        }]
        response = openai.ChatCompletion.create(
            model=MODEL_NAME, 
            messages=messages,
            max_tokens=max_tokens,
            n=n
        )
        responses.append(response)
        
        outputs = [x['message']['content'] for x in response['choices']]
        outputs = [output.replace("Code:\n", "").replace("```python", "").replace("```py", "").replace("```", "").strip() for output in outputs]
        ns = list(np.arange(n))
        
        # duplicate the records
        records_to_duplicate = test_results_df.loc[index] # Fetch the records
        duplicated_records = pd.concat([records_to_duplicate] * (n - 1), ignore_index=False) # Duplicate the records
        test_results_df = pd.concat([test_results_df, duplicated_records], ignore_index=False) # Append the duplicated records back to the original DataFrame (optional)
        
        # set values for output and n
        test_results_df['output'] = test_results_df['output'].astype(object) if 'output' in test_results_df.columns else None
        test_results_df.loc[index, 'output'] = outputs * len(records_to_duplicate)
        test_results_df['n'] = test_results_df['n'].astype(object) if 'n' in test_results_df.columns else None
        test_results_df.loc[index, 'n'] = ns * len(records_to_duplicate)
        
        test_results_df.to_csv(results_file_path, index=True, compression='gzip')
        
        time.sleep(wait_time_in_seconds)

Generating predictions for 152 records


Processing records:   0%|          | 0/152 [00:00<?, ?it/s]

#### API Spec prompt

In [721]:
MODEL_NAME = 'gpt-3.5-turbo-1106'

n = 100
id_labels = ['sample_id'] #['test_id', 'sample_id', 'sample_minor_id']
prompt_type = 'apispec' # 'examples' or 'apispec'
model_id = 'openai_' + MODEL_NAME.replace('-', '_')
slug = 'text2code'
examples_limit = 2

test_df = load_eval_data()

results_file_path = f"./build/results/test-{str(test_df.shape[0])}-{model_id}-{slug}-n{n}-{prompt_type}-ex{examples_limit}.csv.gz"
test_results_df = pd.read_csv(results_file_path, compression='gzip') if os.path.exists(results_file_path) else test_df.copy()
test_results_df.set_index(id_labels, inplace=True)
test_results_df.sort_index(inplace=True)

In [722]:
chat_format = True
prompt = build_prompt(prompt_type=prompt_type, examples_limit=examples_limit, chat_format=chat_format)
print("prompt", prompt)
prompt_str = "\n".join([p['content'] for p in prompt if p['role'] == 'user'])
print("promtp_str", prompt_str[:100])

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
base_prompt_tokens_len = len(tokenizer(prompt_str, max_length=51200, truncation=True)["input_ids"])
print(f"Base prompt tokens length: {base_prompt_tokens_len}")

prompt [{'role': 'user', 'content': '\n    Convert the following task into Python code.\n    This is the API specification:\n# MAP:\n\nclass Map(Action):\n    """\n    The Map class contains all the methods of a virtual assistant agent in the map domain.\n    """\n\n    @classmethod\n    def find_on_map(cls, location: Location) -> List[MapEntity]:\n        """\n        This class method finds places on the map.\n\n        Parameters\n        ----------\n        location : Location\n            The location to search for\n\n        Returns\n        -------\n        List[MapEntity]\n            A list of places in the form of map entities\n        """\n        pass\n\n# SHOPPING:\n\nclass Shopping(Action):\n    """\n    The Shopping class contains all the methods of a virtual assistant agent in the shopping domain.\n    """\n\n    @classmethod\n    def find_products(\n        cls,\n        product_name: Optional[ProductName] = None,\n        product_attribute: Optional[ProductAttribute] 

In [725]:
openai_disabled = False
force = False
total_records = test_df.shape[0]
max_tokens = 500
wait_time_in_seconds = 20

# generate predictions
responses = []
print(f"Generating predictions for {total_records} records")
for i, row  in tqdm.notebook.tqdm(test_df.iterrows(), total=total_records, desc="Processing records"):   
    # check to see if we already have a result for this record
    index = tuple(row[id_label] for id_label in id_labels) if len(id_labels) > 1 else row[id_labels[0]]
    index = [index]
    if (not force) and ('output' in test_results_df.loc[index]) and (not any(pd.isnull(test_results_df.loc[index, 'output']))):
        # if we do, then skip this record
        time.sleep(0.1)
    elif not openai_disabled:
        # run the model, if we don't have a result
        messages = [{
            "role": "system", 
            # "content": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into Python code. Below are examples of text descriptions and their corresponding Python code implementations."
            "content": "You are a skilled programmer. You will be provided with a text description and your task is to convert it into Python code. Below is an API spec for implementing the description in code."
        }] + prompt + [{
            "role": "user",
            # "content": "Based on the previous examples, convert the following text into Python code:\n" + build_example_prompt(text=row["text"])
            "content": "Based on the API spec and previous examples, convert the following text into Python code:\n" + build_example_prompt(text=row["text"])
        }]
        response = openai.ChatCompletion.create(
            model=MODEL_NAME, 
            messages=messages,
            max_tokens=max_tokens,
            n=n
        )
        responses.append(response)
        
        outputs = [x['message']['content'] for x in response['choices']]
        outputs = [output.replace("Code:\n", "").replace("```python", "").replace("```py", "").replace("```", "").strip() for output in outputs]
        ns = list(np.arange(n))
        
        # duplicate the records
        records_to_duplicate = test_results_df.loc[index] # Fetch the records
        duplicated_records = pd.concat([records_to_duplicate] * (n - 1), ignore_index=False) # Duplicate the records
        test_results_df = pd.concat([test_results_df, duplicated_records], ignore_index=False) # Append the duplicated records back to the original DataFrame (optional)
        
        # set values for output and n
        test_results_df['output'] = test_results_df['output'].astype(object) if 'output' in test_results_df.columns else None
        test_results_df.loc[index, 'output'] = outputs * len(records_to_duplicate)
        test_results_df['n'] = test_results_df['n'].astype(object) if 'n' in test_results_df.columns else None
        test_results_df.loc[index, 'n'] = ns * len(records_to_duplicate)
        
        test_results_df.to_csv(results_file_path, index=True, compression='gzip')
        
        time.sleep(wait_time_in_seconds)

Generating predictions for 152 records


Processing records:   0%|          | 0/152 [00:00<?, ?it/s]