In [2]:
import openai

import pandas as pd
import tiktoken

from tqdm import tqdm

import sacrebleu
from rouge_score import rouge_scorer

import os
# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv()) # read local .env file

# openai.api_key  = os.getenv('OPENAI_API_KEY')
openai.api_key  = ""

In [3]:
client = openai.AsyncOpenAI(
  api_key=openai.api_key,  # this is also the default, it can be omitted
)


async def get_completion(prompt_, model_="gpt-4"):
    messages_ = [{"role": "user", "content": prompt_}]
    response_ = await client.chat.completions.create(
        model=model_, 
        messages=messages_,
        temperature=0.00000001,
    )
    return response_.choices[0].message.content

# Set up prompt

In [5]:
str_term = "(λx.((λy.((λz.z) x)) (λa.a)))"
next_step_term = "(λx.((λy.y) (λa.a)))"

prompt = f"""
Please generate the next step of reduction a Lambda Calculus term. Provide only term expression.

Lambda term: '''{str_term}'''
"""
response = await get_completion(prompt)
print(f"expected output: {next_step_term}")
print(f"model output: {response}")

expected output: (λx.((λy.y) (λa.a)))
model output: (λx.(λa.a) x)


# Load the data

In [6]:
df = pd.read_csv("./data/term_step_LO.csv", delimiter=",")
df.head()

Unnamed: 0,term,term_next_LO
0,((λx.(λy.((y y) (λz.z)))) (λa.(((λb.b) (a (λc....,(λx.((x x) (λy.y)))
1,(((λx.(λy.((y y) (λz.((λa.(λb.(λc.(x (c (a c))...,((λx.((x x) (λy.((λz.(λa.(λb.((λc.(((λd.(c d))...
2,((λx.((x x) (λy.((λz.(λa.(λb.((λc.(((λd.(c d))...,((((λx.x) ((λy.y) (λz.(z (λa.z))))) ((λx.x) ((...
3,((((λx.x) ((λy.y) (λz.(z (λa.z))))) ((λx.x) ((...,((((λx.x) (λy.(y (λz.y)))) ((λa.a) ((λb.b) (λc...
4,((((λx.x) (λy.(y (λz.y)))) ((λa.a) ((λb.b) (λc...,(((λx.(x (λy.x))) ((λz.z) ((λa.a) (λb.(b (λc.b...


In [7]:
enc_tiktoken = tiktoken.encoding_for_model("gpt-4")
total_tokens = 0
for term in df["term"].tolist():
    total_tokens += len(enc_tiktoken.encode(term))

print(f"Total term tokens: {total_tokens}")

total_tokens = 0
for term in df["term_next_LO"].tolist():
    total_tokens += len(enc_tiktoken.encode(term))
    
print(f"Total expected tokens: {total_tokens}")

Total term tokens: 523365
Total expected tokens: 502995


# Reduce size of expected and input terms tokens

In [23]:
df_reduced = df[[len(enc_tiktoken.encode(term)) < 40 for term in df["term"].tolist()]]

In [24]:
len(df_reduced)

305

In [25]:
total_tokens = 0
for term in df_reduced["term"].tolist():
    total_tokens += len(enc_tiktoken.encode(term))

print(f"Total term tokens: {total_tokens}")

total_tokens = 0
for term in df_reduced["term_next_LO"].tolist():
    total_tokens += len(enc_tiktoken.encode(term))
    
print(f"Total expected tokens: {total_tokens}")

Total term tokens: 8219
Total expected tokens: 6418


# Normalize terms with gpt-4-turbo model

In [26]:
term_answers = []

for str_term in tqdm(df_reduced["term"].tolist()):
    prompt = f"""
    Please generate the next step of reduction a lambda term. Provide only term expression.
    
    Lambda term: '''{str_term}'''
    """
    
    response = await get_completion(prompt)
    term_answers.append(response)

100%|██████████| 305/305 [08:52<00:00,  1.75s/it]


In [27]:
term_answers

['((λy.(λz.(λa.a))) (λb.b)) ((λc.c) (λd.d))',
 '((λy.(λz.z)) (λa.a))',
 '(λy.y)',
 '(λy.(y (y y)))',
 "'''(λy.(y ((((λz.λz) y) y) ((λz.λz λz.λz) (y (((λb.b) y) y))))))'''",
 "'''(λy.(y ((((λz.λz) y) y) ((λz.λz λz.λz) (y ((λa.a) y))))))'''",
 '((λy.(λz.y)) (λc.(λd.(λe.(λj.e))))',
 '(λy.(λz.(λa.(λb.(λc.b)))))',
 '(λx.(λy.(λz.((z (z ((λa.(z (z z))) (z z)))) z)))',
 '((λy.y) (λa.(λb.((a a) ((b (λc.(a (λd.d)))) b)))))',
 '(λy.(λz.((y y) ((z (λa.(y (λb.b)))) z))))',
 "'''(λx.(((x x) (λy.((x ((λz.z) y)) (x (λb.y))))) (x x)))'''",
 "'''(λx.(((x x) (λy.((x ((λz.z) y)) (x (λa.y))))) (x x)))'''",
 '((((λy.y) (λz.(z (λa.a)))) (λb.b))',
 '((λy.(y (λz.z))) (λa.a))',
 '(λz.z (λy.y))',
 '(λy.y)',
 "'''(λy.(λz.(λa.(λb.(λc.(λd.(λe.(λj.(λi.(λn.e)))))))))'''",
 '((((λy.y) (λz.(λa.a))) (λb.(λc.((λd.d) (λe.e))))))',
 '((λy.(λz.z)) (λa.(λb.((λc.c) (λd.d)))))',
 '(λy.y)',
 "'''(λy.(λz.z)) ((λa.(((a (λb.a)) a) ((λc.a) a))) (λd.(λe.e)))'''",
 '((λy.(λz.z)) (λa.a))',
 "'''(λx.(x (x (x (λy.(((y ((x x) ((λz.(x z))

In [28]:
df_reduced = df_reduced.reset_index(drop=True)

In [29]:
df_reduced["gpt4_answers"] = term_answers

In [30]:
df_reduced.head()

Unnamed: 0,term,term_next_LO,gpt4_answers
0,(((λx.((λy.(λz.(λa.a))) x)) (λb.b)) ((λc.c) (λ...,(((λx.(λy.(λz.z))) (λa.a)) ((λb.b) (λc.c))),((λy.(λz.(λa.a))) (λb.b)) ((λc.c) (λd.d))
1,(((λx.x) (λy.(λz.z))) (λa.a)),((λx.(λy.y)) (λz.z)),((λy.(λz.z)) (λa.a))
2,((λx.(λy.y)) (λz.z)),(λx.x),(λy.y)
3,((λx.x) (λy.(y (y y)))),(λx.(x (x x))),(λy.(y (y y)))
4,(λx.(λy.(y ((((x (λz.x)) y) y) ((x x) (y (((λa...,(λx.(λy.(y ((((x (λz.x)) y) y) ((x x) (y ((λa....,'''(λy.(y ((((λz.λz) y) y) ((λz.λz λz.λz) (y (...


In [31]:
df_reduced.to_csv("./data/gpt4_answers.csv", index=False)

# Calculate accuracy

In [33]:
true_count = 0

for next_term, next_pred_term in zip(df_reduced["term_next_LO"], df_reduced["gpt4_answers"]):
    if next_term == next_pred_term:
        true_count += 1
        
print("pure accuracy: ", true_count * 100 / len(df_reduced), "%")

pure accuracy:  3.278688524590164 %


# Calculate ROUGE and BLEU

In [35]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

scores = []
for ref_term, sys_term in zip(df_reduced["term_next_LO"].tolist(), df_reduced["gpt4_answers"].tolist()):
    score = scorer.score(ref_term, sys_term)
    scores.append(score)

In [36]:
scores

[{'rouge1': Score(precision=0.8, recall=0.8, fmeasure=0.8000000000000002),
  'rouge2': Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778),
  'rougeL': Score(precision=0.8, recall=0.8, fmeasure=0.8000000000000002)},
 {'rouge1': Score(precision=0.6, recall=0.6, fmeasure=0.6),
  'rouge2': Score(precision=0.5, recall=0.5, fmeasure=0.5),
  'rougeL': Score(precision=0.6, recall=0.6, fmeasure=0.6)},
 {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)},
 {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0),
  'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)},
 {'rouge1': Score(precision=0.4666666666666667, recall=0.5, fmeasure=0.4827586206896552),
  'rouge2': Score(precision=0.14285714285714285, recall=0.15384615384615385, fmeasure=0.14814814814814817),
  '

In [37]:
avg_scores = {
    "metric": ["ROUGE-1", "ROUGE-2", "ROUGE-L"],
    "precision": [0.0, 0.0, 0.0], 
    "recall": [0.0, 0.0, 0.0],
    "fmeasure": [0.0, 0.0, 0.0], 
}

for score in scores:
    avg_scores["precision"][0] += score["rouge1"].precision
    avg_scores["precision"][1] += score["rouge2"].precision
    avg_scores["precision"][2] += score["rougeL"].precision
    
    avg_scores["recall"][0] += score["rouge1"].recall
    avg_scores["recall"][1] += score["rouge2"].recall
    avg_scores["recall"][2] += score["rougeL"].recall
    
    avg_scores["fmeasure"][0] += score["rouge1"].fmeasure
    avg_scores["fmeasure"][1] += score["rouge2"].fmeasure
    avg_scores["fmeasure"][2] += score["rougeL"].fmeasure
    
for key_ in ("precision", "recall", "fmeasure"):
    avg_scores[key_][0] /= len(scores)
    avg_scores[key_][1] /= len(scores)
    avg_scores[key_][2] /= len(scores)
    
pd.DataFrame(avg_scores).head()

Unnamed: 0,metric,precision,recall,fmeasure
0,ROUGE-1,0.608902,0.640599,0.612309
1,ROUGE-2,0.412818,0.438391,0.414213
2,ROUGE-L,0.581954,0.610755,0.584519


In [38]:
import sacrebleu

In [40]:
references = [[ref_term, sys_term] for ref_term, sys_term in zip(df_reduced["term_next_LO"].tolist(), df_reduced["gpt4_answers"].tolist())]
candidate = df_reduced["term_next_LO"].tolist()

bleu = sacrebleu.corpus_bleu(candidate, references)
print("BLEU Score:", bleu.score)

BLEU Score: 100.00000000000004


# Compare actual terms