In [257]:
%load_ext dotenv
%dotenv

In [264]:
import sys
import os 

WORK_AREA = '..'
os.chdir(WORK_AREA)

paths = ['./src/', './src/api/v6']
for path in paths:
    path = os.path.normcase(path)
    if not any(os.path.normcase(sp) == path for sp in sys.path):
        sys.path.append(path)

In [293]:
from typing import Union, List
import openai
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from transformers import GPT2TokenizerFast
import math
import tokenize
from nltk.translate import bleu_score


In [266]:
openai.organization = os.getenv("OPENAI_API_ORG")
openai.api_key = os.getenv("OPENAI_API_KEY")

## Building a prompt

**Build prompt from the API docstrings**

In [282]:

def build_spec_prompt():
    prompt_dict = {}
    for prompt_file in glob.glob('./config/prompts/**/*.txt'):
        key = os.path.basename(prompt_file).split('.')[0].lower()
        with open(prompt_file, "r") as f:
            prompt_dict[key] = f.read()
    
    prompt = ""
    for key, value in prompt_dict.items():
        prompt = prompt + f"# {key.upper()}:\n\n{value}\n\n"
    
    return prompt

In [284]:
prompt = build_spec_prompt()

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
len(tokenizer(prompt, max_length=51200, truncation=True)["input_ids"])

17187

**Examples prompt**


Building a prompt from the text and code examples generated by the generator

In [285]:
file_path = 'build/train_complex_utterance_to_code_with_intermediate_40k.csv.gz'
examples_df = pd.read_csv(file_path)
examples_df = examples_df.reset_index()  # make sure indexes pair with number of rows

In [286]:
def build_example_prompt(text, code=None):
    examples_prompt = f"text: \n{text}\n\n"
    examples_prompt += f"code: \n{code}\n\n\n" if code else f"code: \n"
    
    return examples_prompt


def build_examples_prompt(examples_prompt, df, limit=10):
    examples_prompt = examples_prompt or ""
    for index, row in df[:limit].iterrows():
        examples_prompt += build_example_prompt(text=row['text'], code=row['code'])
    
    return examples_prompt

In [287]:
examples_prompt = """
Transform text to Python code

# EXAMPLES:

"""
prompt = build_examples_prompt(examples_prompt, examples_df, limit=18)

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
len(tokenizer(prompt, max_length=51200, truncation=True)["input_ids"])


3420

## Evaluation dataset

In [288]:
file_name = 'eval_complex_utterance_to_code_with_intermediate_82_20230519.csv.gz'
base_path = '/Users/asaf/Workspace/biu/complex-utterance-to-code/build'
eval_df = pd.read_csv(os.path.join(base_path, file_name))
eval_df = eval_df.reset_index()  # make sure indexes pair with number of rows

In [289]:
eval_df["code"].str.len().max()

854

In [344]:
def build_test_code(code: str, imports: str, test: str, code_embed_str: str = '# end code block to test', fail_on_error: bool = False, verbose: str = 'Fatal'):
  try:
    code_insert_idx = test.find(code_embed_str)
    program_code = imports
    program_code += '\n'
    program_code += test[:code_insert_idx]
    program_code += code
    program_code += '\n'
    program_code += test[code_insert_idx:]
  except Exception as e:
    if verbose == 'Error':
      print('[ERROR] Failed to unparse code rep to code\n', e)
    if fail_on_error:
      raise e
    program_code = ''
  finally:
    return program_code
  
  
def tokenize_source(code):
    file_path = "/tmp/example.py"

    with open(file_path, "w") as text_file:
        text_file.write(code)
        
    with open(file_path, 'rb') as f:
        tokens_gen = tokenize.tokenize(f.readline)
        tokens = []
        try:
          for token in tokens_gen:
            tokens.append(token.string)
        except Exception as e:
          pass
        
    os.remove(file_path)
    return tokens


def eval_code(code: str):
  test_results = {}
  try:
    context = {}
    exec(code, context)
    test_results = context.get('test_results', {})
  except AssertionError as e:
    test_results['test_failuers'] = test_results.get('test_failuers', 0) + 1
  except Exception as e:
    test_results['code_failure'] = test_results.get('code_failure', 0) + 1

  code_failure = test_results.get('code_failure', 0)
  correct = test_results.get('correct', 0)
  incorrect = test_results.get('incorrect', 0)
  total = (correct + incorrect) or math.inf
  accuracy = (1 - code_failure) * (correct / total)

  results = dict(
    code_failure = code_failure,
    correct = correct,
    incorrect = incorrect,
    accuracy = accuracy,
  )

  return results


def eval_bleu(code, generated_code):
  hypothesis = tokenize_source(code)
  reference = tokenize_source(generated_code)
  weights = (0.25, 0.25, 0.25, 0.25)
  score = bleu_score.sentence_bleu([reference], hypothesis, weights=weights)
  return score


def humaneval_accuracy_score(
    data: pd.DataFrame, 
    code_column_name: str = 'pred_code', 
    score_id_labels: Union[str, List[str]] = 'sample_id', 
    score_column_name: str = 'accuracy', 
):
    test_codes = data.apply(lambda x: build_test_code(code=x[code_column_name], imports=x['imports'], test=x['test']), axis=1)
    test_results = test_codes.apply(lambda test_code: eval_code(test_code))
    test_results_df = pd.DataFrame.from_records(
        test_results.values, index=test_results.index
    )
    score = test_results_df.reset_index(drop=False).groupby(score_id_labels)[score_column_name].mean().mean()
    return dict(score=score, results=test_results_df)


def bleu_accuracy_score(
    data: pd.DataFrame, 
    generated_column='output', 
    gold_column='code',
    score_id_labels: Union[str, List[str]] = 'sample_id', 
    score_column_name: str = 'bleu_score', 
):
    eval_results = data.apply(lambda x: eval_bleu(x[gold_column], x[generated_column]), axis=1)
    eval_results_df = eval_results.to_frame('bleu_score')
    score = eval_results_df.reset_index(drop=False).groupby(score_id_labels)[score_column_name].mean().mean()
    return dict(score=score, results=eval_results_df)
  
  
def model_eval(
    results_file_path, 
    output_column='output', 
    gold_column='code', 
    parse_to_code=False, 
    compute_humanval=True, 
    compute_bleu=True
):
    results_df = pd.read_csv(results_file_path, compression='gzip')
    
    results_df['sample_id'] = results_df['sample_id'].astype(int)
    results_df.set_index(['sample_id', 'sample_minor_id'], inplace=True)
    results_df.sort_index(inplace=True)
    
    code_column = 'generated_code'
    if parse_to_code:
        results_df[code_column] = results_df[output_column].apply(lambda x: parse_code_rep_to_code(x))
    else: 
        results_df[code_column] = results_df[output_column]
        
    results_df['test'] = results_df['test'].str.replace("= next(iterator)", "= next(iterator, None)")
    results_df[code_column] = results_df[code_column].str.replace(" = ContentType.", " = MessageContentType.")
    results_df[code_column] = results_df[code_column].str.replace("Message.", "Messages.")

    humaneval_results = humaneval_accuracy_score(
        data=results_df, 
        code_column_name=code_column) if compute_humanval else None
    
    bleu_results = bleu_accuracy_score(
        data=results_df, 
        generated_column=code_column, 
        gold_column=gold_column) if compute_bleu else None
    
    results = dict(
        humaneval = humaneval_results,
        bleu = bleu_results
    )
    return results
  
  

## OpenAI Predictions

### List available models

In [295]:
oai_models = openai.Model.list()
print([model_data['id'] for model_data in oai_models['data']])

['whisper-1', 'babbage', 'gpt-3.5-turbo', 'davinci', 'text-davinci-edit-001', 'text-davinci-003', 'babbage-code-search-code', 'text-similarity-babbage-001', 'code-davinci-edit-001', 'text-davinci-001', 'ada', 'babbage-code-search-text', 'babbage-similarity', 'code-search-babbage-text-001', 'text-curie-001', 'gpt-4', 'code-search-babbage-code-001', 'text-ada-001', 'text-embedding-ada-002', 'text-similarity-ada-001', 'curie-instruct-beta', 'gpt-4-0314', 'ada-code-search-code', 'ada-similarity', 'code-search-ada-text-001', 'text-search-ada-query-001', 'davinci-search-document', 'ada-code-search-text', 'text-search-ada-doc-001', 'davinci-instruct-beta', 'text-similarity-curie-001', 'code-search-ada-code-001', 'ada-search-query', 'text-search-davinci-query-001', 'curie-search-query', 'davinci-search-query', 'babbage-search-document', 'ada-search-document', 'text-search-curie-query-001', 'text-search-babbage-doc-001', 'curie-search-document', 'text-search-curie-doc-001', 'babbage-search-quer

In [296]:
print([model_data['id'] for model_data in oai_models['data'] if 'code' in model_data['id']])

['babbage-code-search-code', 'code-davinci-edit-001', 'babbage-code-search-text', 'code-search-babbage-text-001', 'code-search-babbage-code-001', 'ada-code-search-code', 'code-search-ada-text-001', 'ada-code-search-text', 'code-search-ada-code-001']


### text-davinci-003

In [298]:
MODEL_NAME = 'text-davinci-003'

In [312]:
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
base_prompt = build_examples_prompt(examples_prompt, examples_df, limit=15)

responses = []
for i, row  in tqdm_notebook(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/92 [00:00<?, ?it/s]

In [314]:
responses_file_path = f'./build/openai-{MODEL_NAME}-{file_name}'
responses_file_path

'./build/openai-text-davinci-003-eval_complex_utterance_to_code_with_intermediate_82_20230519.csv.gz'

In [334]:
responses_data = [response['choices'][0]['text'] for response in responses]
eval_oai_df = eval_df.copy()
eval_oai_df['output'] = pd.Series(responses_data)
eval_oai_df.head()

Unnamed: 0,index,test_id,sample_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,output
0,0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"product_name = ProductName.resolve_from_text(""..."
1,1,1_a,1,a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
2,2,1_b,1,b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
3,3,2,2,,Play the new Taylor Swift album and pull up my...,"album = Album.resolve_from_text(""the new Taylo...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Play ] ] ...,[ Module [ album = Album.resolve_from_text('th...,"album_name = AlbumName.resolve_from_text(""the ..."
4,4,3_a,3,a,Send a message to dad if it rains tomorrow.,"date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Body [...,[ Module [ date_time = DateTime.resolve_from_t...,"destination = Contact.resolve_from_text(""dad"")..."


In [323]:

eval_oai_df.to_csv(responses_file_path, index=False, compression='gzip')

In [324]:
eval_oai_df = pd.read_csv(responses_file_path)
eval_oai_df.head()

Unnamed: 0,index,test_id,sample_id,sample_minor_id,text,code,test,imports,lang_rep,code_rep,output
0,0,0,0,,Check the availability of Pepsi at Walmart and...,"product_name = ProductName.resolve_from_text(""...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Check ] ]...,[ Module [ product_name = ProductName.resolve_...,"product_name = ProductName.resolve_from_text(""..."
1,1,1_a,1,a,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
2,2,1_b,1,b,"If it's raining tomorrow morning, set my alarm...","date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Test [...,[ Module [ date_time = DateTime.resolve_from_t...,"date_time = DateTime.resolve_from_text(""tomorr..."
3,3,2,2,,Play the new Taylor Swift album and pull up my...,"album = Album.resolve_from_text(""the new Taylo...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Action [ hd [ Play ] ] ...,[ Module [ album = Album.resolve_from_text('th...,"album_name = AlbumName.resolve_from_text(""the ..."
4,4,3_a,3,a,Send a message to dad if it rains tomorrow.,"date_time = DateTime.resolve_from_text(""tomorr...",# test data\ndata_model = DataModel(reset=True...,from entities.generic import *\nfrom entities....,[ root [ S [ Command [ Condition [ If [ Body [...,[ Module [ date_time = DateTime.resolve_from_t...,"destination = Contact.resolve_from_text(""dad"")..."


In [347]:
model_eval(
    responses_file_path,
    compute_humanval=True, 
    compute_bleu=True
)

{'humaneval': {'score': 0.08333333333333333,
  'results':                            code_failure  correct  incorrect  accuracy
  sample_id sample_minor_id                                            
  0         NaN                         0        4          0       1.0
  1         a                           1        0          0       0.0
            b                           1        0          0       0.0
  2         NaN                         1        0          0       0.0
  3         a                           1        0          0       0.0
  ...                                 ...      ...        ...       ...
  104       b                           0        1          0       1.0
  105       NaN                         1        0          0       0.0
  108       NaN                         1        0          0       0.0
  109       a                           1        0          0       0.0
            b                           1        0          0       0.0
  
  [92

In [339]:
results_file_path = responses_file_path
parse_to_code = False
output_column = 'output'

results_df = pd.read_csv(results_file_path, compression='gzip')

results_df['sample_id'] = results_df['sample_id'].astype(int)
results_df.set_index(['sample_id', 'sample_minor_id'], inplace=True)
results_df.sort_index(inplace=True)

code_column = 'generated_code'
results_df[code_column] = results_df[output_column]
    
results_df['test'] = results_df['test'].str.replace("= next(iterator)", "= next(iterator, None)")
results_df[code_column] = results_df[code_column].str.replace(" = ContentType.", " = MessageContentType.")
results_df[code_column] = results_df[code_column].str.replace("Message.", "Messages.")

In [346]:
data = results_df
gold_column = 'code'
generated_column = 'generated_code'
score_id_labels = 'sample_id'
score_column_name: str = 'bleu_score'

eval_results = data.apply(lambda x: eval_bleu(x[gold_column], x[generated_column]), axis=1)
eval_results_df = eval_results.to_frame('bleu_score')
score = eval_results_df.reset_index(drop=False).groupby(score_id_labels)[score_column_name].mean().mean()
dict(score=score, results=eval_results_df)

{'score': 0.4386397592741266,
 'results':                            bleu_score
 sample_id sample_minor_id            
 0         NaN                0.641628
 1         a                  0.535784
           b                  0.521357
 2         NaN                0.437082
 3         a                  0.413056
 ...                               ...
 104       b                  0.679365
 105       NaN                0.299206
 108       NaN                0.365651
 109       a                  0.310716
           b                  0.291676
 
 [92 rows x 1 columns]}

In [335]:
eval_oai_df.set_index(['sample_id', 'sample_minor_id'], inplace=True)

In [348]:
print(eval_oai_df['output'].loc[(1, 'a')])

date_time = DateTime.resolve_from_text("tomorrow morning")
weather_forecasts = Weather.find_weather_forecasts(date_time=date_time)
test_rain = any((weather_forecast.weather_type for weather_forecast in weather_forecasts) == Rain)
te_est_weather_forecasts = bool(weather_forecasts)
Responder.respond(response=test_weather_forecasts)
if test_weather_forecasts and test_rain:
  date_time = DateTime.resolve_from_text("7:30")
  Alarm.create_alarm(date_time=date_time)
else:
  date_time = DateTime.resolve_from_text("8")
  Alarm.create_alarm(date_time=date_time)


In [None]:
date_time = DateTime.resolve_from_text("tomorrow morning")
weather_forecasts = Weather.find_weather_forecasts(date_time=date_time)
test_rain = any((weather_forecast.weather_type for weather_forecast in weather_forecasts) == Rain)
te_est_weather_forecasts = bool(weather_forecasts)
Responder.respond(response=test_weather_forecasts)
if test_weather_forecasts and test_rain:
  date_time = DateTime.resolve_from_text("7:30")
  Alarm.create_alarm(date_time=date_time)
else:
  date_time = DateTime.resolve_from_text("8")
  Alarm.create_alarm(date_time=date_time)

In [337]:
print(eval_oai_df['output'].loc[(105, None)])

event_name = EventName.resolve_from_text("the art festival")
date_time = DateTime.resolve_from_text("this weekend")
events = Calendar.find_events(event_name=event_name, date_time=date_time)
Tickets.purchase_tickets(events=events)

address = Address.resolve_from_text("the address")
Navigation.add_address_to_navigation(address=address)


In [None]:
event_name = EventName.resolve_from_text("the art festival")
date_time = DateTime.resolve_from_text("this weekend")
events = Calendar.find_events(event_name=event_name, date_time=date_time)
Tickets.purchase_tickets(events=events)

address = Address.resolve_from_text("the address")
Navigation.add_address_to_navigation(address=address)

In [343]:
print(eval_oai_df['output'].loc[(55, None)])

person_reminded = Contact.resolve_from_text("me")
date_time = DateTime.resolve_from_text("tomorrow")
contacts = Contact.resolve_many_from_text("Mom and Dad")
content = Content.resolve_from_text("send an email to contacts")
Reminders.create_reminder(person_reminded=person_reminded, date_time=date_


#### Evaluating the results

In [None]:
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'generated_code'],
      dtype='object')

In [None]:
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0625

In [237]:
MODEL_NAME = 'text-davinci-003'

In [238]:
examples_prompt = """
Transform text to code

# EXAMPLES:

"""
base_prompt = build_examples_prompt(examples_prompt, examples_df, limit=13)

responses = []
for i, row  in tqdm_notebook(eval_df.iterrows(), total=eval_df.shape[0], desc="Processing records"):
    prompt = base_prompt
    prompt += build_example_prompt(text=row['text'])
    
    response = openai.Completion.create(engine=MODEL_NAME, prompt=prompt, max_tokens=1000)
    responses.append(response)

Processing records:   0%|          | 0/82 [00:00<?, ?it/s]

In [191]:
responses_df = pd.DataFrame(responses)
responses_df.to_csv(f'../build/openai-{MODEL_NAME}-{file_name}', index=False, compression='gzip')

In [205]:
responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)

0     walmart_availability = Store.check_availabilit...
1     date_time = DateTime.resolve_from_text("tomorr...
2     date_time = DateTime.resolve_from_text("tomorr...
3     music_source = MusicSource.resolve_from_text("...
4     recipient = Recipient.resolve_from_text("Dad")...
                            ...                        
77    spotify_playlist_name = "lofi"\nMediaPlayer.pl...
78    date_time = DateTime.resolve_from_text("tonigh...
79    date_time = DateTime.resolve_from_text("tonigh...
80    date_time_start = DateTime.resolve_from_text("...
81    date_time_tomorrow = DateTime.resolve_from_tex...
Name: choices, Length: 82, dtype: object

#### Evaluating the results

In [212]:
eval_df.columns

Index(['index', 'test_id', 'sample_id', 'sample_minor_id', 'text', 'code',
       'test', 'imports', 'lang_rep', 'code_rep', 'generated_code'],
      dtype='object')

In [219]:
eval_df['generated_code'] = responses_df['choices'].apply(lambda choices: choices[0]['text'] if choices else None)
eval_df['test_code'] = eval_df.apply(lambda row: build_test_code(code=row['generated_code'], imports=row['imports'], test=row['test']), axis=1)
eval_df['results'] = eval_df['test_code'].apply(lambda code: eval_code(code))

scores_df = compute_scores(eval_df, index='sample_id')
scores_df.groupby('sample_id')['score'].mean().mean()

0.0625

### gpt-4

MODEL_NAME = 'text-gpt4'
model = openai.Model(MODEL_NAME)

In [271]:
df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8]},
                  index=['falcon', 'dog', 'spider', 'fish'])
df2 = pd.DataFrame({'num_legs': [2, 4, 8, 0],
                   'num_wings': [2, 0, 0, 0],
                   'num_specimen_seen': [10, 2, 1, 8]},)
df

Unnamed: 0,num_legs,num_wings,num_specimen_seen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [272]:
pd.concat([df, df2], axis=1)

Unnamed: 0,num_legs,num_wings,num_specimen_seen,num_legs.1,num_wings.1,num_specimen_seen.1
falcon,2.0,2.0,10.0,,,
dog,4.0,0.0,2.0,,,
spider,8.0,0.0,1.0,,,
fish,0.0,0.0,8.0,,,
0,,,,2.0,2.0,10.0
1,,,,4.0,0.0,2.0
2,,,,8.0,0.0,1.0
3,,,,0.0,0.0,8.0


In [None]:
import evaluate

module = evaluate.load("dvitel/codebleu")
src = 'class AcidicSwampOoze(MinionCard):§    def __init__(self):§        super().__init__("Acidic Swamp Ooze", 2, CHARACTER_CLASS.ALL, CARD_RARITY.COMMON, battlecry=Battlecry(Destroy(), WeaponSelector(EnemyPlayer())))§§    def create_minion(self, player):§        return Minion(3, 2)§'
tgt = 'class AcidSwampOoze(MinionCard):§    def __init__(self):§        super().__init__("Acidic Swamp Ooze", 2, CHARACTER_CLASS.ALL, CARD_RARITY.COMMON, battlecry=Battlecry(Destroy(), WeaponSelector(EnemyPlayer())))§§    def create_minion(self, player):§        return Minion(3, 2)§'
src = src.replace("§","\n")
tgt = tgt.replace("§","\n")
res = module.compute(predictions = [tgt], references = [[src]])
print(res)