# Preparation

In [2]:
import os
import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import time
import ast

In [3]:
# Load the environment variables from the .env file
load_dotenv()
api_key = os.getenv("API_KEY")

# Set up OpenAI API credentials
openai.api_key = api_key

In [9]:
def load_data(batch_size):
    # Load the WN18RR test set from a the txt file available here https://github.com/villmow/datasets_knowledge_embedding/raw/master/WN18RR/text/test.txt
    wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)

    # Convert the test set to a list of triplets
    test_triplets = [(triplet[0], triplet[1], triplet[2]) for triplet in wn18rr_test_set.values]

    # Chunk the test triplets into batches of size 50
    # batch_size = 50
    num_batches = (len(test_triplets) + batch_size - 1) // batch_size
    triplet_batches = [
        test_triplets[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)
    ]
    return triplet_batches

# Wrong implementation in first step !

## First prompt format

In [None]:
triplet_batches = load_data(50)
# Initialize lists to store responses
responses = []
generated_entities = []
response_arrays = []

# Open files in write mode
file1 = open('results/responses.txt', 'w')
file2 = open('results/generated_entities.txt', 'w')

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    prompt = 'Consider the relation prediction task, in the following lines first the entity and second the relation are separated by space. Give me all the missing entities in a single array in your answer:'
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]} {triplet[1]}'

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    responses.append(response)
    generated_entities.append(generated_entities)

    file1.write(str(response)+ '\n')
    file2.write(str(generated_entity)+ '\n')
    time.sleep(20)

# Close files
file1.close()
file2.close()

In [80]:
ge_file = open('results/generated_entities.txt', 'r')
results = []

for line in ge_file:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.find("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            continue
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        results.append(array)
    except: 
        results.append([])
len(results)

59

## Second prompt format

In [None]:
triplet_batches = load_data(50)
# Initialize lists to store responses
responses2 = []
generated_entities2 = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # Open files in append mode
    responses2_file = open('results/responses2.txt', 'a')
    generated_entities2_file = open('results/generated_entities2.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    prompt += "Find just missing entities (NOT THE RELATION OR OTHER FALSY WORDS!) respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]"

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    responses2.append(response)
    generated_entities2.append(generated_entities)

    responses2_file.write(str(response)+ '\n')
    generated_entities2_file.write(str(generated_entity)+ '\n')

    # Close files
    responses2_file.close()
    generated_entities2_file.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(19)

In [90]:
ge2_file = open('results/generated_entities2.txt', 'r')
results2 = []

for line in ge2_file:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.find("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            continue
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        results2.append(array)
    except: 
        results2.append([])
len(results2)

63

In [93]:
triplet_batches = load_data(50)
answers = [[l for j,k,l in i] for i in triplet_batches]
unbalance_count = 0
for i, (r, a) in enumerate(zip(results2, answers)):
    if not len(r) == len(a):
        unbalance_count += 1
'count: '+str(unbalance_count), 'rate: '+str(unbalance_count/len(answers))

('count: 38', 'rate: 0.6031746031746031')

In [94]:
correct = 0
for a_list, r_list in zip(answers, results2):
    for i, r in enumerate(r_list):
        if i>len(a_list)-1:
            break
        if r == a_list[i]:
            correct += 1
correct

0

## Third prompt format

### 70 per prompt

In [None]:
triplet_batches = load_data(70)
# Initialize lists to store responses
responses3 = []
generated_entities3 = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # Open files in append mode
    responses3_file = open('results/responses3.txt', 'a')
    generated_entities3_file = open('results/generated_entities3.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    prompt += "Find just missing entities respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]\nNotice: 1.DO NOT GIVE ME THE RELATION OR OTHER FALSY WORDS IN YOUR RESPONSE!\n2. THE LENGTH OF YOUR RESPONSE SHOULD BE: "
    prompt += str(len(triplet_batch))

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    responses3.append(response)
    generated_entities3.append(generated_entities)

    responses3_file.write(str(response)+ '\n')
    generated_entities3_file.write(str(generated_entity)+ '\n')

    # Close files
    responses3_file.close()
    generated_entities3_file.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

In [95]:
ge3_file = open('results/generated_entities3.txt', 'r')
results3 = []

for line in ge3_file:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.find("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            continue
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        results3.append(array)
    except: 
        results3.append([])
len(results3)

45

In [97]:
triplet_batches = load_data(70)

answers = [[l for j,k,l in i] for i in triplet_batches]
unbalance_count = 0
for i, (r, a) in enumerate(zip(results3, answers)):
    if not len(r) == len(a):
        unbalance_count += 1
'count: '+str(unbalance_count), 'rate: '+str(unbalance_count/len(answers))

('count: 37', 'rate: 0.8222222222222222')

In [99]:
correct = 0
for a_list, r_list in zip(answers, results3):
    for i, r in enumerate(r_list):
        if i>len(a_list)-1:
            break
        if r == a_list[i]:
            correct += 1
correct

0

### 50 per prompt

In [None]:
triplet_batches = load_data(50)
# Initialize lists to store responses
responses4 = []
generated_entities4 = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # Open files in append mode
    responses4_file = open('results/responses4.txt', 'a')
    generated_entities4_file = open('results/generated_entities4.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    prompt += "Find just missing entities respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]\nNotice: 1.DO NOT GIVE ME THE RELATION OR OTHER FALSY WORDS IN YOUR RESPONSE!\n2. THE LENGTH OF YOUR RESPONSE SHOULD BE: "
    prompt += str(len(triplet_batch))

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    responses4.append(response)
    generated_entities4.append(generated_entities)

    responses4_file.write(str(response)+ '\n')
    generated_entities4_file.write(str(generated_entity)+ '\n')

    # Close files
    responses4_file.close()
    generated_entities4_file.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

In [101]:
ge4_file = open('results/generated_entities4.txt', 'r')
results4 = []

for line in ge4_file:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.find("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            continue
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        results4.append(array)
    except: 
        results4.append([])
len(results4)

63

In [102]:
triplet_batches = load_data(50)
answers = [[l for j,k,l in i] for i in triplet_batches]
unbalance_count = 0
for i, (r, a) in enumerate(zip(results4, answers)):
    if not len(r) == len(a):
        unbalance_count += 1
'count: '+str(unbalance_count), 'rate: '+str(unbalance_count/len(results4))

('count: 45', 'rate: 0.7142857142857143')

In [103]:
correct = 0
for a_list, r_list in zip(answers, results4):
    for i, r in enumerate(r_list):
        if i>len(a_list)-1:
            break
        if r == a_list[i]:
            correct += 1
correct

7

## Few shot prompting

In [4]:
triplet_batches = load_data(50)
samples = []
for _ in range(3):
    samples.append(triplet_batches[-1].pop())
samples

[('piciform_bird.n.01', '_hypernym', 'bird.n.01'),
 ('grocery_store.n.01', '_has_part', 'shelf.n.01'),
 ('transmit.v.04', '_derivationally_related_form', 'channelization.n.01')]

In [None]:
# Initialize lists to store responses
responses5 = []
generated_entities5 = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # Open files in append mode
    responses5_file = open('results/responses5.txt', 'a')
    generated_entities5_file = open('results/generated_entities5.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    prompt += "Find just missing entities respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]\nNotice: 1.DO NOT GIVE ME THE RELATION OR OTHER FALSY WORDS LIKE '' IN YOUR RESPONSE!\n2. THE LENGTH OF YOUR RESPONSE SHOULD BE: "
    prompt += str(len(triplet_batch))
    prompt += "\nI give you 3 samples, consider them:\n"
    for i, s in enumerate(samples):
        prompt += f'{str(i+1)}. entity: {s[0]}, relation: {s[1]}, missing entity(answer): {s[2]}\n'
    prompt += 'DO NOT give me these samples in your response!'

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    responses5.append(response)
    generated_entities5.append(generated_entities)

    responses5_file.write(str(response)+ '\n')
    generated_entities5_file.write(str(generated_entity)+ '\n')

    # Close files
    responses5_file.close()
    generated_entities5_file.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

In [5]:
ge5_file = open('results/generated_entities5.txt', 'r')
results5 = []

for line in ge5_file:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.find("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            continue
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        results5.append(array)
    except: 
        results5.append([])
len(results5)

63

In [6]:
answers = [[l for j,k,l in i] for i in triplet_batches]
unbalance_count = 0
for i, (r, a) in enumerate(zip(results5, answers)):
    if not len(r) == len(a):
        unbalance_count += 1
'count: '+str(unbalance_count), 'rate: '+str(unbalance_count/len(results5))

('count: 48', 'rate: 0.7619047619047619')

In [8]:
correct = 0
for a_list, r_list in zip(answers, results5):
    for i, r in enumerate(r_list):
        if i>len(a_list)-1:
            break
        if r == a_list[i]:
            correct += 1
correct

11

# Implement with 10 suggestion for each item

## Firs try

In [None]:
triplet_batches = load_data(20)
# Initialize lists to store responses
s_responses = []
s_generated_entities = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # Open files in append mode
    s_responses_file = open('results/s_responses.txt', 'a')
    s_generated_entities_file = open('results/s_generated_entities.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = """
Consider the relation prediction task and I want you to behave as a relation prediction model according to the following rules:
1. Data is provided in this format: <target_word> <relation>\\n
2. You should  predict 10 sorted candidate words for each item
3. Your answer should be a nested array that the length of the array is the count of given data, and each array contains 10 sorted candidate words
4. 
5. The output MUST be a nested array with this format:
  [[<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], [<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], ...]
  Example: [['example1.n.01', 'example2.n.01', 'example3.n.01', 'example4.n.01', 'example5.n.01', 'example6.n.01', 'example7.n.01', 'example8.n.01', 'example9.n.01', 'example10.n.01']]
6. Don't generate code, JUST give me the nested list
7. Give output like this and no more explanation: result : <nested_list>

data:
"""
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]} {triplet[1]}'

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=1500,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    s_responses.append(response)
    s_generated_entities.append(generated_entity)

    s_responses_file.write(str(response)+ '\n')
    s_generated_entities_file.write(str(generated_entity)+ '\n')

    # Close files
    s_responses_file.close()
    s_generated_entities_file.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

## Better prompting

In [15]:
triplet_batches = load_data(20)
# Initialize lists to store responses
s_responses2 = []
s_generated_entities2 = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # Open files in append mode
    s_responses_file2 = open('results/s_responses2.txt', 'a')
    s_generated_entities_file2 = open('results/s_generated_entities2.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = f"""
Consider the relation prediction task and I want you to behave as a relation prediction model according to the following rules:
1. Data is provided in this format: <target_word> <relation>\\n
2. You should  predict 10 sorted candidate words for each item
3. Your answer should be a nested array that the length of the array is the count of given data, and each array contains 10 sorted candidate words
4. 
5. The output MUST be a nested array with this format:
  [[<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], [<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], ...]
  Example: [['example1.n.01', 'example2.n.01', 'example3.n.01', 'example4.n.01', 'example5.n.01', 'example6.n.01', 'example7.n.01', 'example8.n.01', 'example9.n.01', 'example10.n.01']]
6. Don't generate code, JUST give me the nested list
7. Give output like this and no more explanation: result : <nested_list>
8. Don't use \\n or in your response
9. Don't give me the example i gave you in rule number 5, give me candidate words!
10. The out put should contains {len(triplet_batch)} arrays each contains 10 candidate, DO NOT make larger arrays and keep array format!

data:
"""
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]} {triplet[1]}'

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=2000,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    s_responses2.append(response)
    s_generated_entities2.append(generated_entity)

    s_responses_file2.write(str(response)+ '\n')
    s_generated_entities_file2.write(str(generated_entity)+ '\n')

    # Close files
    s_responses_file2.close()
    s_generated_entities_file2.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

  0%|          | 0/157 [00:00<?, ?it/s]

100%|██████████| 157/157 [1:18:57<00:00, 30.18s/it]


### Load saved data as list type

In [45]:
s_ge2_file = open('results/s_generated_entities2.txt', 'r')
s_results2 = []

for line in s_generated_entities2:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.rfind("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            array_str = '[]'
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        s_results2.append(array)
    except: 
        s_results2.append([])
len(s_results2)

157

### Calculate count and rate of bad output of API

In [46]:
answers = [[l for j,k,l in i] for i in triplet_batches]
unbalance_count = 0
for i, (r, a) in enumerate(zip(s_results2, answers)):
    if not len(r) == len(a):
        unbalance_count += 1
'count: '+str(unbalance_count), 'rate: '+str(unbalance_count/len(s_results2))

('count: 33', 'rate: 0.21019108280254778')

### Calculate total reciprocal rank

In [47]:
total_reciprocal_rank = 0
for a_list, r_list in zip(answers, s_results2):
    for i, r in enumerate(r_list):
        if i>len(a_list)-1:
            break
        try:
            true_rank = r.index(a_list[i]) + 1
            reciprocal_rank = 1 / true_rank
            # Add the reciprocal rank to the total
            total_reciprocal_rank += reciprocal_rank
        except:
            total_reciprocal_rank += 0
total_reciprocal_rank

159.9543650793651

### Get length of test set

In [40]:
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)
len(wn18rr_test_set.values)

3134

### Calculate MRR

In [41]:
total_reciprocal_rank/len(wn18rr_test_set.values)

0.0510384062154962

## Change request setting and few shot prompting

In [54]:
triplet_batches = load_data(30)
print(len(triplet_batches[-1]))
sample = triplet_batches[-1].pop()
sample, len(triplet_batches[-1])

14


(('piciform_bird.n.01', '_hypernym', 'bird.n.01'), 13)

In [55]:
# Initialize lists to store responses
s_responses3 = []
s_generated_entities3 = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # Open files in append mode
    s_responses_file3 = open('results/s_responses3.txt', 'a')
    s_generated_entities_file3 = open('results/s_generated_entities3.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = f"""
Consider the relation prediction task and I want you to behave as a relation prediction model according to the following rules:
1. Data is provided in this format: <target_word> <relation>\\n
2. You should  predict 10 sorted candidate words for each item
3. Your answer should be a nested array that the length of the array is the count of given data, and each array contains 10 sorted candidate words
4. 
5. The output MUST be a nested array with this format:
  [[<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], [<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], ...]
  Example: [['example1.n.01', 'example2.n.01', 'example3.n.01', 'example4.n.01', 'example5.n.01', 'example6.n.01', 'example7.n.01', 'example8.n.01', 'example9.n.01', 'example10.n.01']]
6. Don't generate code, JUST give me the nested list
7. Give output like this and no more explanation: result : <nested_list>
8. Don't use \\n or in your response
9. Don't give me the example i gave you in rule number 5, give me candidate words!
10. The out put should contains {len(triplet_batch)} arrays each contains 10 candidate, DO NOT make larger arrays and keep array format!
11. An example of <target_word> <relation> <missed_entity> would be {sample[0]} {sample[1]} {sample[1]}. NOTE: it is just an example, don't give me this example as response!

data:
"""
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]} {triplet[1]}'

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=2500,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    s_responses3.append(response)
    s_generated_entities3.append(generated_entity)

    s_responses_file3.write(str(response)+ '\n')
    s_generated_entities_file3.write(str(generated_entity)+ '\n')

    # Close files
    s_responses_file3.close()
    s_generated_entities_file3.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(10)

  0%|          | 0/105 [00:00<?, ?it/s]

100%|██████████| 105/105 [51:21<00:00, 29.35s/it]


### Load saved data as list type

In [85]:
s_ge3_file = open('results/s_generated_entities3.txt', 'r')
s_results3 = []

for line in s_generated_entities3:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.rfind("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            array_str = '[]'
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        s_results3.append(array)
    except: 
        s_results3.append([])
len(s_results3)

105

### Calculate count and rate of bad output of API

In [60]:
answers = [[l for j,k,l in i] for i in triplet_batches]
unbalance_count = 0
for i, (r, a) in enumerate(zip(s_results3, answers)):
    if not len(r) == len(a):
        unbalance_count += 1
'count: '+str(unbalance_count), 'rate: '+str(unbalance_count/len(s_results3))

('count: 22', 'rate: 0.20952380952380953')

### Calculate total reciprocal rank

In [61]:
total_reciprocal_rank = 0
for a_list, r_list in zip(answers, s_results3):
    for i, r in enumerate(r_list):
        if i>len(a_list)-1:
            break
        try:
            true_rank = r.index(a_list[i]) + 1
            reciprocal_rank = 1 / true_rank
            # Add the reciprocal rank to the total
            total_reciprocal_rank += reciprocal_rank
        except:
            total_reciprocal_rank += 0
total_reciprocal_rank

151.29047619047614

### Get length of test set

In [62]:
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)
len(wn18rr_test_set.values)

3134

### Calculate MRR

In [63]:
total_reciprocal_rank/len(wn18rr_test_set.values)

0.04827392348132615

## Repeat last exp. with 20 items per request (**THE BEST**)

In [69]:
triplet_batches = load_data(20)
print(len(triplet_batches[-1]))
sample = triplet_batches[-1].pop()
sample, len(triplet_batches[-1])

14


(('piciform_bird.n.01', '_hypernym', 'bird.n.01'), 13)

In [79]:
# Initialize lists to store responses
s_responses4 = []
s_generated_entities4 = []

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches[40:]):
    # Open files in append mode
    s_responses_file4 = open('results/s_responses4.txt', 'a')
    s_generated_entities_file4 = open('results/s_generated_entities4.txt', 'a')

    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    prompt = f"""
Consider the relation prediction task and I want you to behave as a relation prediction model according to the following rules:
1. Data is provided in this format: <target_word> <relation>\\n
2. You should  predict 10 sorted candidate words for each item
3. Your answer should be a nested array that the length of the array is the count of given data, and each array contains 10 sorted candidate words
4. 
5. The output MUST be a nested array with this format:
  [[<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], [<candidate1>, <candidate2>, <candidate3>, <candidate4>, <candidate5>, <candidate6>, <candidate7>, <candidate8>, <candidate9>, <candidate10>], ...]
  Example: [['example1.n.01', 'example2.n.01', 'example3.n.01', 'example4.n.01', 'example5.n.01', 'example6.n.01', 'example7.n.01', 'example8.n.01', 'example9.n.01', 'example10.n.01']]
6. Don't generate code, JUST give me the nested list
7. Give output like this and no more explanation: result : <nested_list>
8. Don't use \\n or in your response
9. Don't give me the example i gave you in rule number 5, give me candidate words!
10. The out put should contains {len(triplet_batch)} arrays each contains 10 candidate, DO NOT make larger arrays and keep array format!
11. An example of <target_word> <relation> <missed_entity> would be {sample[0]} {sample[1]} {sample[1]}. NOTE: it is just an example, don't give me this example as response!

data:
"""
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]} {triplet[1]}'

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=2500,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0,  # Adjust the temperature as per your requirements
    )
    
    # Extract the generated entity from the API response
    generated_entity = response.choices[0].text.strip()
    
    s_responses4.append(response)
    s_generated_entities4.append(generated_entity)

    s_responses_file4.write(str(response)+ '\n')
    s_generated_entities_file4.write(str(generated_entity)+ '\n')

    # Close files
    s_responses_file4.close()
    s_generated_entities_file4.close()

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(10)

100%|██████████| 117/117 [48:03<00:00, 24.64s/it]


### Load saved data as list type

In [84]:
s_ge4_file = open('results/s_generated_entities4.txt', 'r')
s_results4 = []

for line in s_ge4_file:
    # Extract the array from the text
    try:
        start_index = line.find("[")
        end_index = line.rfind("]")
        array_str = line[start_index:end_index+1]
        if start_index == -1:
            array_str = '[]'
        if end_index == -1:
            array_str = '[]'
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)

        # Append the extracted array
        s_results4.append(array)
    except: 
        s_results4.append([])
len(s_results4)

157

### Calculate count and rate of bad output of API

In [86]:
answers = [[l for j,k,l in i] for i in triplet_batches]
unbalance_count = 0
for i, (r, a) in enumerate(zip(s_results4, answers)):
    if not len(r) == len(a):
        unbalance_count += 1
'count: '+str(unbalance_count), 'rate: '+str(unbalance_count/len(s_results4))

('count: 11', 'rate: 0.07006369426751592')

### Calculate total reciprocal rank

In [87]:
total_reciprocal_rank = 0
for a_list, r_list in zip(answers, s_results4):
    for i, r in enumerate(r_list):
        if i>len(a_list)-1:
            break
        try:
            true_rank = r.index(a_list[i]) + 1
            reciprocal_rank = 1 / true_rank
            # Add the reciprocal rank to the total
            total_reciprocal_rank += reciprocal_rank
        except:
            total_reciprocal_rank += 0
total_reciprocal_rank

183.57420634920635

### Get length of test set

In [88]:
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)
len(wn18rr_test_set.values)

3134

### Calculate MRR

In [89]:
total_reciprocal_rank/len(wn18rr_test_set.values)

0.058575049888068394