In [1]:
from openai import OpenAI, AsyncOpenAI
import asyncio
import os
from utils import get_dataset,get_ans_words_chard
from datasets import load_dataset
from tqdm import tqdm   
import emoji
import json


client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))



In [2]:
# eval_dataset_path = 'data/clue_json/guardian/naive_random/test.json'
# train_dataset_path = 'data/clue_json/guardian/naive_random/train.json'

# eval_dataset = load_dataset('json', data_files=eval_dataset_path, split='train')

# train_dataset = load_dataset('json', data_files=train_dataset_path, split='train')

In [3]:
def process_outputs(output, correct_answer):

    cleaned_length_error = True
    original_length_error = True
    original_correct = False
    cleaned_correct =False

    output = output.strip()
    correct_answer = correct_answer.strip()

    if output== correct_answer:
        original_correct = True
    if len(output) == len(correct_answer):
        original_length_error = False



    original_words = output.lower().split(' ')

    answer_lengths  =  [len(x) for x in  correct_answer.split(' ')]
    answer = []
    cleaned_answer = output
    if len(original_words) >= len(answer_lengths):
        for idx, length in enumerate(answer_lengths):
            answer.append(original_words[idx][:length])


        cleaned_answer = ' '.join(answer).strip()
        if cleaned_answer == correct_answer.strip():
            cleaned_correct = True
        if len(cleaned_answer) == len(correct_answer.strip()):
            cleaned_length_error = False
    

    return {'original_length_error': original_length_error,
            'cleaned_length_error': cleaned_length_error,
            'original_correct': original_correct,
            'cleaned_correct': cleaned_correct,
            'cleaned_answer': cleaned_answer}

In [4]:
def create_prompt(clue, shots):

    n_words, n_chars = get_ans_words_chard(clue)

    p = f'''The next line is a clue for a cryptic crossword. The clue consists of a definition part and a wordplay part. The answer consists of {n_words} words, and the number of characters in the answer is {n_chars}. Output only the answer.\n\n'''
    for shot in shots:
        c = shot["clue"]
        ans = shot["labels"]
        p += f'clue:\n{c}\n\n'
        p += f'answer:\n{ans}\n\n'

    p += f'clue:\n{clue}\n'
    p += f'answer:\n'

    return p

In [5]:
def save_results(temp, file_name):

    ## Initialize the file if it does not exist
    if not os.path.exists(file_name):
        with open(file_name,'w') as file:
            json.dump([],file)
    
    file_data = []
    with open(file_name,'r') as file:
        file_data = json.load(file)

    file_data.extend(temp)
    with open(file_name,'w') as file:
        # file.seek(0)
        json.dump(file_data,file)
        

In [6]:
from utils import get_dataset,generate_prompt

import numpy as np

errors = 0

model_name = 'gpt-3.5-turbo'
chatgpt_outputs_file = f'outputs/new_experiments/chatgpt_outputs/{model_name}_3_shot_random_extensive_prompt.json'

shots = 3
prompt_head='''The next line is a clue for a cryptic crossword. The clue consists of a definition part and a wordplay part. The answer consists of {n_words} words, and the number of characters in the answer is {n_chars}. Output only the answer.'''
dataset_path='data/clue_json/guardian/naive_random/test.json'

dataset = load_dataset('json', data_files=dataset_path, split='train')
dataset = dataset.remove_columns(['idx'])
dataset = dataset.rename_column('target', 'labels')
dataset = dataset.rename_column('input', 'clue')
# dataset = get_dataset(dataset_path=dataset_path,
#                         split='test',prompt_head = prompt_head,
#                         shots=shots,
#                         dataset_type='old',
#                         indicator_type_shots = 1,
#                         indicators_dict_path='data/indicators_examples.json',)

In [7]:
import random



num_examples = len(dataset)
save_temps = []

offset = 600
# with open(chatgpt_outputs_file, 'a') as f:
for idx ,clue in enumerate(tqdm(dataset.select(range(offset,num_examples)))):
    
    idx = idx + offset

    chosen= np.random.randint(0,len(dataset),shots)
    current_shots = dataset.select(chosen)

    # clue['prompt'] = clue['prompt'] + '\n\n### Response:\n'
    
    try:
      # correct_answers.append(clue["target"])
      clue_message = {"role": "user", "content": create_prompt(clue['clue'], current_shots)}#clue['prompt']}
      completion = client.chat.completions.create(
        # request_timeout=15,
        model=model_name,

        messages=[
          # system_message,
          clue_message
        ]
      )



      response = completion.choices[0].message.content.lower()
      save_temps.append({'idx': idx, 'clue': clue['clue'],'response': response, 'target': clue["labels"]})
    except:
      save_temps.append({'idx': idx})
      errors += 1

    if idx % 100 == 0 or idx == num_examples - 1:
      save_results(save_temps,chatgpt_outputs_file)
      save_temps = []

        

    


100%|██████████| 27876/27876 [84:04:33<00:00, 10.86s/it]     


In [15]:
for x in dataset:
    print(x)
    break

{'clue': 'Achy shaking stopped by iodine, salt and kaolin (5,4)', 'labels': 'china clay', 'prompt': '### Instruction: The next line is a clue for a cryptic crossword. The clue consists of a definition part and a wordplay part. The answer consists of 2 words, and the number of characters in the answer is 5,4. Output only the answer.\n\n### Input:\nAchy shaking stopped by iodine, salt and kaolin (5,4)'}


In [16]:
print(x['prompt'])

### Instruction: The next line is a clue for a cryptic crossword. The clue consists of a definition part and a wordplay part. The answer consists of 2 words, and the number of characters in the answer is 5,4. Output only the answer.

### Input:
Achy shaking stopped by iodine, salt and kaolin (5,4)


In [29]:
# with open('chatgpt_outputs/gpt-3.5-turbo_3shot_learning_outputs.json') as f:
#     d = json.load(f)
#     for i in d:
#         print(i)

{'idx': 0, 'response': 'a little', 'target': 'a trifle'}
{'idx': 1, 'response': 'marseille', 'target': 'jerusalem'}
{'idx': 2, 'response': 'tower', 'target': 'tower'}
{'idx': 3, 'response': 'sad', 'target': 'down'}
{'idx': 4, 'response': 'greenpeace', 'target': 'greenpeace'}


In [2]:
import json
with open('outputs/new_experiments/chatgpt_outputs/gpt-3.5-turbo_3shot_learning_outputs_disjoint.json') as f:
    d = json.load(f)


chatgpt_outputs = []
correct_answers = []
errors = 0
for i in d:
    if 'response' in i:
        chatgpt_outputs.append(i['response'])
        correct_answers.append(i['target'])
    else:
        errors += 1

assert len(chatgpt_outputs) == len(correct_answers)


In [18]:
# chatgpt_outputs = []
# correct_answers = []

# with open(f'chatgpt_outputs/gpt-3.5-turbo_outputs.txt', 'r') as f:
#   lines = f.readlines()

# cleaned_lines = []
# for i,l in enumerate(lines):
#   if not l[0].isdigit():
#     cleaned_lines[-1] = cleaned_lines[-1].strip() + l.strip()
#   else:
#     cleaned_lines.append(l.strip())


# print(len(cleaned_lines))
# for l in cleaned_lines:

#   ll = l.strip().split('|')
#   if len(ll) != 3:
#     print(l)

#   ## if this line doesn't has 3 components, it means that the response is empty, so we ignore it
#   if len(ll) < 3:
#     continue

#     idx, response, target = ll
#   splitted = l.strip().split('|')
#   idx = splitted[0]
#   target = splitted[-1]
#   response = ' '.join(splitted[1:-1])
  
#   chatgpt_outputs.append(response)
#   correct_answers.append(target)


# assert len(chatgpt_outputs) == len(correct_answers)

32636
818|stoppi|ng + peace|niks - iks = stopping|cinderella
17119|to secure the first course baked in pastry, you need to take the following steps:
1. the first course refers to the letter at the beginning of the word.
2. "baked in pastry" indicates that the rest of the word is formed by rearranging the letters of "pastry".thus, the answer to this clue is "april fool".|en croute
19520|sad | which refers to the girlfriend being sad and sand | which refers to the sandwiches being cut.|shed a tear
19542|there are two possible answers for this clue:
1) tyndale - tyndale is a bible translator known for his work in translating the bible into english. it can be split as "ty" (the first person in french) + "nd" (the first person in latin) + "ale" (a city in france). the answer has a total of 6 characters.
2) jerome - jerome is a bible translator known for his work in translating the bible into latin. it can be split as "jero" (the first person in french) + "me" (a city in france). the answer 

In [12]:
#################33 Evaluation ##################3

assert len(chatgpt_outputs) == len(correct_answers)

num_examples = len(chatgpt_outputs)

original_correct = 0
cleaned_correct = 0
original_length_error = 0
cleaned_length_error = 0


save_file = 'gpt-3.5-turbo_0-shot_random_learning_outputs.txt'
dataset_path='naive_random'

with open(save_file, 'w') as f:


    for idx, (output, correct_answer) in enumerate(zip(chatgpt_outputs, correct_answers)):
        results = process_outputs(output, correct_answer)

        original_correct += results['original_correct']
        cleaned_correct += results['cleaned_correct']
        original_length_error += results['original_length_error']
        cleaned_length_error += results['cleaned_length_error']
        cleaned_answer = results['cleaned_answer']

        f.write(f'Original output: {output}\n')
        if results['cleaned_correct'] :
            f.write(emoji.emojize(f'{cleaned_answer} | {correct_answer}  :check_mark_button: \n'))
        else:
            f.write(emoji.emojize(f'{cleaned_answer} | {correct_answer}  :cross_mark: \n'))

        f.write('----------------------------------------------------- \n\n')

    f.write('\n\n')
    f.flush()


    f.seek(0)
    f.write(f'Dataset: {dataset_path}\n')

    f.write(f'Number of Examples {num_examples}\n')
    print(f'Number of Examples {num_examples}\n')

    f.write(f' Cleaned ACCURACY:  { float (cleaned_correct / num_examples)}\n')
    print(f' Cleaned ACCURACY:  { float (cleaned_correct / num_examples)}\n')

    f.write(f'Orginal ACCURACY:  { float (original_correct / num_examples)}\n')
    print(f'Orginal ACCURACY:  { float (original_correct / num_examples)}\n')

    f.write(f'Length error:  { float ((cleaned_length_error / num_examples) )}\n')
    print(f'Length error:  { float ((cleaned_length_error / num_examples) )}\n')

    f.write(f'Original Length error:  { float ((original_length_error / num_examples) )}\n')
    print(f'Original Length error:  { float ((original_length_error / num_examples) )}\n')
    

Number of Examples 16202

 Cleaned ACCURACY:  0.02950253055178373

Orginal ACCURACY:  0.028514998148376745

Length error:  0.26731267744722875

Original Length error:  0.42254042710776446



In [None]:
# print(correct_answers)
# print(chatgpt_outputs)

for i,j in zip(chatgpt_outputs, correct_answers):
    print(i,j)
    print(len(i), len(j))

musical a trifle
7 8
marseille jerusalem
9 9
magnet tower
6 5
soap down
4 4
conserving greenpeace
10 10
eye ooh
3 3
sagaal lissome
6 7
silhouette televise
10 8
safeguard (9) patroller
13 9
city wall holy city
9 9


# 100 samples results: 

Original correct: 0.06
Cleaned correct: 0.06
Original length error: 0.33
Cleaned length error: 0.17