In [None]:
import openai
import tiktoken
import os
import pandas as pd
import numpy as np
import time

Use tiktoken.get_encoding() to load an encoding by name.

The first time this runs, it requires an internet connection to download the encoding data. Later runs reuse the downloaded data and do not need an internet connection.

In [None]:
# Load the tokenizer by its encoding name and bind it to `encoding`.
encoding = tiktoken.get_encoding("cl100k_base")

Use tiktoken.encoding_for_model() to automatically load the correct encoding for a given model name.

In [None]:
# Rebind `encoding` to the tokenizer that tiktoken maps to this model name.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [None]:
# Read the OpenAI API key from the first line of a local file so that the
# secret is never hard-coded in the notebook.
with open("openai_token.txt", 'r') as fp:
    # readline() keeps the trailing newline; strip surrounding whitespace so
    # the key is not sent with a stray "\n" attached.
    openai_token = fp.readline().strip()
if not openai_token:
    raise ValueError("openai_token.txt is empty: expected the API key on its first line")
# Set your OpenAI API key here
openai.api_key = openai_token
# Alternatively, expose the key through the environment variable
#os.environ["OPENAI_API_KEY"] = openai_token

## Prompt preparation

In [None]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that ``string`` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up
            (passed to ``tiktoken.get_encoding``).

    Returns:
        The length of the encoded token sequence.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

In [None]:
def remove_non_ascii(sentence):
    """Return ``sentence`` with every character of code point 128 or above dropped.

    Characters are kept in their original order; nothing is substituted for
    the removed ones.
    """
    kept_chars = []
    for char in sentence:
        if ord(char) <= 127:
            kept_chars.append(char)
    return ''.join(kept_chars)

In [None]:
# Load the two system prompts as lists of lines. Iterating the file object
# yields the same lines as readlines(), each keeping its trailing newline.
with open('../data/prompt/prompt_system_etd_V3.txt', 'r') as fp:
    etd_system = list(fp)

with open('../data/prompt/prompt_system_ps_V3.txt', 'r') as fp:
    ps_system = list(fp)

In [None]:
# Display the ETD system prompt with its lines joined by single spaces.
# The lines still carry their newlines from readlines(), so every line after
# the first begins with an extra space in the joined text.
" ".join(etd_system)

In [None]:
# Token count (cl100k_base) of the joined ETD system prompt alone.
total_user_etd_len = num_tokens_from_string(" ".join(etd_system), "cl100k_base")
total_user_etd_len

In [None]:
# Display the PS system prompt with its lines joined by single spaces.
# The lines still carry their newlines from readlines(), so every line after
# the first begins with an extra space in the joined text.
" ".join(ps_system)

In [None]:
# Token count (cl100k_base) of the joined PS system prompt alone.
total_user_ps_len = num_tokens_from_string(" ".join(ps_system), "cl100k_base")
total_user_ps_len

In [None]:
# Load the issues to classify; the file is decoded as ISO-8859-1.
# The "issue" column is the text read by the cells below.
df_issue = pd.read_csv("../data/issue.csv", encoding = "ISO-8859-1")

In [None]:
# Number of rows loaded into df_issue.
len(df_issue)

In [None]:
# Strip non-ASCII characters from every issue text, write the cleaned texts
# back into the "issue" column, then preview the frame.
issue_clean = list(map(remove_non_ascii, df_issue["issue"]))
df_issue["issue"] = issue_clean
df_issue.head()

### Few-shot examples (random selection)

In [None]:
# choose either run ETD or PS
# df_etd_example = pd.read_csv("../data/prompt/example/etd_example_V3.csv")
df_ps_example = pd.read_csv("../data/prompt/example/ps_example_V3.csv")

In [None]:
# Keep the ETD example rows whose "5_1examples" column equals 1.
# Requires df_etd_example to have been loaded by the cell above.
df_etd_randexample = df_etd_example[df_etd_example["5_1examples"]==1]

# Few-shot user messages for ETD. NOTE: the PS cell further down rebinds
# chat_issue, so whichever of the two cells ran last determines its content.
chat_issue = list(df_etd_randexample["issue"].values)
chat_issue

In [None]:
# Few-shot assistant replies for ETD, in the same row order as the "issue"
# values taken from df_etd_randexample.
chat_output = list(df_etd_randexample["output"].values)
chat_output

In [None]:
# Keep the PS example rows whose "5_1examples" column equals 1.
df_ps_randexample = df_ps_example[df_ps_example["5_1examples"]==1]

# Few-shot user messages for PS. This rebinds chat_issue, replacing any ETD
# examples assigned by an earlier cell.
chat_issue = list(df_ps_randexample["issue"].values)
chat_issue

In [None]:
# Few-shot assistant replies for PS, in the same row order as the "issue"
# values taken from df_ps_randexample. This rebinds chat_output.
chat_output = list(df_ps_randexample["output"].values)
chat_output

## Model running

In [None]:
# Total ETD prompt size: the system prompt plus every issue/output pair in
# chat_issue / chat_output.
# The total is recomputed from scratch instead of being added onto the
# previously stored value, so re-running this cell no longer double-counts
# the examples.
# NOTE: chat_issue / chat_output hold whichever example set was assigned last.
total_user_etd_len = num_tokens_from_string(" ".join(etd_system), "cl100k_base")
total_user_etd_len += sum(
    num_tokens_from_string(text, "cl100k_base")
    for user_string, output_string in zip(chat_issue, chat_output)
    for text in (user_string, output_string)
)

print("ETD total input token:",total_user_etd_len)

In [None]:
# Total PS prompt size: the system prompt plus every issue/output pair in
# chat_issue / chat_output.
# The total is recomputed from scratch instead of being added onto the
# previously stored value, so re-running this cell no longer double-counts
# the examples.
# NOTE: chat_issue / chat_output hold whichever example set was assigned last.
total_user_ps_len = num_tokens_from_string(" ".join(ps_system), "cl100k_base")
total_user_ps_len += sum(
    num_tokens_from_string(text, "cl100k_base")
    for user_string, output_string in zip(chat_issue, chat_output)
    for text in (user_string, output_string)
)

print("PS total input token:",total_user_ps_len)

In [None]:
# Query the chat model once per issue and collect the replies in `ans1`.
#
# Each request is: system prompt, N_SHOTS few-shot (user, assistant) pairs,
# then the issue to classify wrapped in triple quotes.

# Number of few-shot pairs taken from the front of chat_issue / chat_output.
N_SHOTS = 10
# Consecutive rate-limit failures tolerated before giving up, so the loop
# cannot sleep-and-retry forever (e.g. when the quota is exhausted).
MAX_RATE_LIMIT_RETRIES = 5
# Seconds to wait after a rate-limit error before resuming.
RATE_LIMIT_WAIT_SECONDS = 60

if len(chat_issue) < N_SHOTS or len(chat_output) < N_SHOTS:
    raise ValueError(
        f"Expected at least {N_SHOTS} few-shot examples, got "
        f"{len(chat_issue)} issues and {len(chat_output)} outputs"
    )

# The system prompt and few-shot turns are identical for every request, so
# build them once instead of on every iteration.
# system_prompt = " ".join(etd_system)
system_prompt = " ".join(ps_system)
few_shot_messages = [{"role": "system", "content": system_prompt}]
for example_issue, example_output in zip(chat_issue[:N_SHOTS], chat_output[:N_SHOTS]):
    few_shot_messages.append({"role": "user", "content": example_issue})
    few_shot_messages.append({"role": "assistant", "content": example_output})

ans1 = []
start_i = 0
flag = True
rate_limit_retries = 0
while flag:
    try:
        for i in range(start_i, len(df_issue)):
            # i is a position (0..len-1), so index positionally.
            issue = df_issue["issue"].iloc[i]
            input_str = "\"\"\" {} \"\"\"".format(issue)

            result = openai.ChatCompletion.create(
                model="gpt-4o",
#                 model="gpt-3.5-turbo",
                messages=few_shot_messages + [{"role": "user", "content": input_str}],
                max_tokens = 300,
                temperature = 0,
#                 seed = 30
            )

            ans1.append(result['choices'][0]['message']['content'])
            # A successful call ends the current run of rate-limit failures.
            rate_limit_retries = 0

        flag = False
    except openai.error.RateLimitError as e:
        print(f"RateLimitError: {e}")
        rate_limit_retries += 1
        if rate_limit_retries > MAX_RATE_LIMIT_RETRIES:
            print(f"Giving up after {MAX_RATE_LIMIT_RETRIES} consecutive rate-limit errors")
            break
        time.sleep(RATE_LIMIT_WAIT_SECONDS)
        # One answer is stored per completed issue, so resume at the first
        # issue that has no answer yet.
        start_i = len(ans1)
    except Exception as e:
        print(f"An error occurred: {e}")
        break  # Exit the loop on other errors

In [None]:
# Number of answers collected; this is smaller than len(df_issue) if the
# request loop stopped early on an error.
print(len(ans1))

In [None]:
# Gather the generated answers into a single-column frame and preview the
# first ten rows.
df = pd.DataFrame({"answer1": ans1})

df.head(10)

In [None]:
# Persist the generated answers as CSV.
# NOTE(review): index = None appears intended to omit the index column
# (index=False is the usual spelling) — confirm against the written file.
df.to_csv("../experiment/OpenAI/generated_output.csv",index = None)

## Test Evaluation

In [None]:
from sklearn import metrics as mt

In [None]:
# ETD results: precision, recall and F1 of the predicted labels (y''_ETD)
# against the reference labels (y_ETD) from a saved result file.

df_test_random = pd.read_csv("../experiment/OpenAI/ETD/etd_random_V3_0example_gpt4o_result(t=0).csv")
y_test, predictions_etd = df_test_random["y_ETD"], df_test_random["y''_ETD"]
precison_etd, recall_etd, score_etd = (
    metric(y_test, predictions_etd)
    for metric in (mt.precision_score, mt.recall_score, mt.f1_score)
)

print("precision:",round(precison_etd,3),"recall:",round(recall_etd,3),"F1:",round(score_etd,3))

In [None]:
# PS results: precision, recall and F1 of the predicted labels (y''_PS)
# against the reference labels (y_PS) from a saved result file.

df_test_random = pd.read_csv("../experiment/OpenAI/PS/ps_random_avg_V3_10example_gpt4o_result(t=0).csv")
y_test, predictions_ps = df_test_random["y_PS"], df_test_random["y''_PS"]
precison_ps, recall_ps, score_ps = (
    metric(y_test, predictions_ps)
    for metric in (mt.precision_score, mt.recall_score, mt.f1_score)
)

print("precision:",round(precison_ps,3),"recall:",round(recall_ps,3),"F1:",round(score_ps,3))