# Preparation

In [2]:
import os
import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import time
import ast

In [3]:
# Read the key/value pairs of the local .env file into the process environment
load_dotenv()

# Look up the key stored under API_KEY (None when the variable is absent)
# and register it with the OpenAI client
api_key = os.environ.get("API_KEY")
openai.api_key = api_key

In [None]:
# Read the WN18RR test split: a headerless, tab-separated text file available at
# https://github.com/villmow/datasets_knowledge_embedding/raw/master/WN18RR/text/test.txt
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)

# Turn every row into a 3-tuple built from its first three columns
test_triplets = [(row[0], row[1], row[2]) for row in wn18rr_test_set.values]

# Split the triplets into consecutive batches of 50 (the last batch may be shorter)
batch_size = 50
num_batches = -(-len(test_triplets) // batch_size)  # ceiling division
triplet_batches = [
    test_triplets[start:start + batch_size]
    for start in range(0, len(test_triplets), batch_size)
]

# Wrong implementation in the first step!

## First prompt format

In [None]:
# Initialize lists to store responses
responses = []
generated_entities = []
response_arrays = []

# Make sure the output directory exists before opening files inside it
os.makedirs('results', exist_ok=True)

# Open both output files in write mode; the context manager closes them
# even if an API call raises part-way through the loop
with open('results/responses.txt', 'w') as file1, \
        open('results/generated_entities.txt', 'w') as file2:
    # Loop through the test triplets and make API calls to ChatGPT
    for triplet_batch in tqdm(triplet_batches):
        prompt = 'Consider the relation prediction task, in the following lines first the entity and second the relation are separated by space. Give me all the missing entities in a single array in your answer:'
        # One "<entity> <relation>" line per triplet; the third element is not sent
        for triplet in triplet_batch:
            prompt += f'\n{triplet[0]} {triplet[1]}'

        response = openai.Completion.create(
            engine="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=3100,  # Adjust the max tokens as per your requirements
            n=1,  # Generate a single response
            stop=None,  # Let ChatGPT decide when to stop the completion
            temperature=0.6,  # Adjust the temperature as per your requirements
        )

        # Extract the generated text from the API response
        generated_entity = response.choices[0].text.strip()

        responses.append(response)
        # Store this batch's completion text (the original appended the list to itself)
        generated_entities.append(generated_entity)

        file1.write(str(response) + '\n')
        file2.write(str(generated_entity) + '\n')

        # Pause between requests
        time.sleep(20)

In [None]:
results = []

# Read the saved completions back line by line; the context manager
# guarantees the file handle is closed afterwards
with open('results/generated_entities.txt', 'r') as ge_file:
    for line in ge_file:
        start_index = line.find("[")
        # Lines without an opening bracket carry no array and are skipped
        if start_index == -1:
            continue

        end_index = line.find("]")
        # An array that is never closed on this line is treated as empty
        array_str = line[start_index:end_index + 1] if end_index != -1 else '[]'

        try:
            # Convert the string representation of the array to a Python list
            results.append(ast.literal_eval(array_str))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed array text: record an empty list instead of failing.
            # Only literal_eval's documented errors are caught, so interrupts
            # and unrelated bugs are no longer silently swallowed.
            results.append([])
len(results)

## Second prompt format

In [None]:
# Initialize lists to store responses
responses2 = []
generated_entities2 = []

# Make sure the output directory exists before appending to files inside it
os.makedirs('results', exist_ok=True)

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    # The prompt lists all entities first, then all relations in the same order
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    # The leading newline keeps the instruction from being glued onto the last relation line
    prompt += "\nFind just missing entities (NOT THE RELATION OR OTHER FALSY WORDS!) respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]"

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )

    # Extract the generated text from the API response
    generated_entity = response.choices[0].text.strip()

    responses2.append(response)
    # Store this batch's completion text (the original appended an unrelated list here)
    generated_entities2.append(generated_entity)

    # Results are appended to disk after every batch; the context manager
    # closes both handles even if a write fails
    with open('results/responses2.txt', 'a') as responses2_file, \
            open('results/generated_entities2.txt', 'a') as generated_entities2_file:
        responses2_file.write(str(response) + '\n')
        generated_entities2_file.write(str(generated_entity) + '\n')

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(19)

In [None]:
# Display the list filled by the second-prompt loop for a quick manual inspection
generated_entities2

### Process result

In [None]:
# Parse one array per collected completion so that `results` keeps exactly
# one entry per element of `generated_entities2`
results = []
for ge in generated_entities2:
    # Locate the first bracketed array in the completion text
    start_index = ge.find("[")
    end_index = ge.find("]")

    if start_index == -1 or end_index == -1:
        # No complete array in the text: fall back to an empty one
        array_str = '[]'
    else:
        array_str = ge[start_index:end_index + 1]

    try:
        # Convert the string representation of the array to a Python list
        array = ast.literal_eval(array_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Malformed array text: record an empty list instead of aborting the cell
        array = []

    # Append the extracted array
    results.append(array)

In [None]:
# Number of parsed arrays next to the number of batches
(len(results), len(triplet_batches))

In [None]:
# Per batch, collect the third element of every triplet
answers = [[target for _entity, _relation, target in batch] for batch in triplet_batches]
# Size of the last answer list next to the size of the last parsed array
(len(answers[-1]), len(results[-1]))

In [None]:
# Count the batches whose parsed array length differs from the number of answers
unbalance_count = sum(
    1
    for predicted, expected in zip(results, answers)
    if len(predicted) != len(expected)
)
(f'count: {unbalance_count}', f'rate: {unbalance_count/len(results)}')

## Third prompt format

### 70 per prompt

In [None]:
# Read the WN18RR test split: a headerless, tab-separated text file available at
# https://github.com/villmow/datasets_knowledge_embedding/raw/master/WN18RR/text/test.txt
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)

# Turn every row into a 3-tuple built from its first three columns
test_triplets = [(row[0], row[1], row[2]) for row in wn18rr_test_set.values]

# Split the triplets into consecutive batches of 70 (the last batch may be shorter)
batch_size = 70
num_batches = -(-len(test_triplets) // batch_size)  # ceiling division
triplet_batches = [
    test_triplets[start:start + batch_size]
    for start in range(0, len(test_triplets), batch_size)
]

In [None]:
# Initialize lists to store responses
responses3 = []
generated_entities3 = []

# Make sure the output directory exists before appending to files inside it
os.makedirs('results', exist_ok=True)

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    # The prompt lists all entities first, then all relations in the same order
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    # The leading newline keeps the instruction from being glued onto the last relation line
    prompt += "\nFind just missing entities respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]\nNotice: 1.DO NOT GIVE ME THE RELATION OR OTHER FALSY WORDS IN YOUR RESPONSE!\n2. THE LENGTH OF YOUR RESPONSE SHOULD BE: "
    # Tell the model how many answers are expected for this batch
    prompt += str(len(triplet_batch))

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )

    # Extract the generated text from the API response
    generated_entity = response.choices[0].text.strip()

    responses3.append(response)
    # Store this batch's completion text (the original appended an unrelated list here)
    generated_entities3.append(generated_entity)

    # Results are appended to disk after every batch; the context manager
    # closes both handles even if a write fails
    with open('results/responses3.txt', 'a') as responses3_file, \
            open('results/generated_entities3.txt', 'a') as generated_entities3_file:
        responses3_file.write(str(response) + '\n')
        generated_entities3_file.write(str(generated_entity) + '\n')

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

In [None]:
results3 = []

# Read the saved completions back line by line; the context manager
# guarantees the file handle is closed afterwards
with open('results/generated_entities3.txt', 'r') as ge3_file:
    for line in ge3_file:
        start_index = line.find("[")
        # Lines without an opening bracket carry no array and are skipped
        if start_index == -1:
            continue

        end_index = line.find("]")
        # An array that is never closed on this line is treated as empty
        array_str = line[start_index:end_index + 1] if end_index != -1 else '[]'

        try:
            # Convert the string representation of the array to a Python list
            results3.append(ast.literal_eval(array_str))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed array text: record an empty list instead of failing.
            # Only literal_eval's documented errors are caught, so interrupts
            # and unrelated bugs are no longer silently swallowed.
            results3.append([])
len(results3)

In [None]:
# Per batch, collect the third element of every triplet
answers = [[target for _entity, _relation, target in batch] for batch in triplet_batches]

# Count the batches whose parsed array length differs from the number of answers
unbalance_count = sum(
    1
    for predicted, expected in zip(results3, answers)
    if len(predicted) != len(expected)
)
(f'count: {unbalance_count}', f'rate: {unbalance_count/len(answers)}')

In [None]:
# Count position-wise exact matches between parsed predictions and answers.
# The inner zip stops at the shorter list, so predictions beyond the number
# of answers in a batch are ignored.
correct = sum(
    1
    for a_list, r_list in zip(answers, results3)
    for predicted, expected in zip(r_list, a_list)
    if predicted == expected
)
correct

### 50 per prompt

In [None]:
# Load the WN18RR test set from the txt file available here:
# https://github.com/villmow/datasets_knowledge_embedding/raw/master/WN18RR/text/test.txt
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)

# Convert the test set to a list of 3-tuples built from the first three columns
test_triplets = [(triplet[0], triplet[1], triplet[2]) for triplet in wn18rr_test_set.values]

# Chunk the test triplets into batches of size 50, as this section's heading states.
# The previous value of 70 repeated the 70-per-prompt configuration.
batch_size = 50
num_batches = (len(test_triplets) + batch_size - 1) // batch_size  # ceiling division
triplet_batches = [
    test_triplets[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)
]

In [None]:
# Initialize lists to store responses
responses4 = []
generated_entities4 = []

# Make sure the output directory exists before appending to files inside it
os.makedirs('results', exist_ok=True)

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    # The prompt lists all entities first, then all relations in the same order
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    # The leading newline keeps the instruction from being glued onto the last relation line
    prompt += "\nFind just missing entities respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]\nNotice: 1.DO NOT GIVE ME THE RELATION OR OTHER FALSY WORDS IN YOUR RESPONSE!\n2. THE LENGTH OF YOUR RESPONSE SHOULD BE: "
    # Tell the model how many answers are expected for this batch
    prompt += str(len(triplet_batch))

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )

    # Extract the generated text from the API response
    generated_entity = response.choices[0].text.strip()

    responses4.append(response)
    # Store this batch's completion text (the original appended an unrelated list here)
    generated_entities4.append(generated_entity)

    # Results are appended to disk after every batch; the context manager
    # closes both handles even if a write fails
    with open('results/responses4.txt', 'a') as responses4_file, \
            open('results/generated_entities4.txt', 'a') as generated_entities4_file:
        responses4_file.write(str(response) + '\n')
        generated_entities4_file.write(str(generated_entity) + '\n')

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

In [None]:
results4 = []

# Read the saved completions back line by line; the context manager
# guarantees the file handle is closed afterwards
with open('results/generated_entities4.txt', 'r') as ge4_file:
    for line in ge4_file:
        start_index = line.find("[")
        # Lines without an opening bracket carry no array and are skipped
        if start_index == -1:
            continue

        end_index = line.find("]")
        # An array that is never closed on this line is treated as empty
        array_str = line[start_index:end_index + 1] if end_index != -1 else '[]'

        try:
            # Convert the string representation of the array to a Python list
            results4.append(ast.literal_eval(array_str))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed array text: record an empty list instead of failing.
            # Only literal_eval's documented errors are caught, so interrupts
            # and unrelated bugs are no longer silently swallowed.
            results4.append([])
len(results4)

In [None]:
# Per batch, collect the third element of every triplet
answers = [[target for _entity, _relation, target in batch] for batch in triplet_batches]

# Count the batches whose parsed array length differs from the number of answers
unbalance_count = sum(
    1
    for predicted, expected in zip(results4, answers)
    if len(predicted) != len(expected)
)
(f'count: {unbalance_count}', f'rate: {unbalance_count/len(results4)}')

In [None]:
# Count position-wise exact matches between parsed predictions and answers.
# The inner zip stops at the shorter list, so predictions beyond the number
# of answers in a batch are ignored.
correct = sum(
    1
    for a_list, r_list in zip(answers, results4)
    for predicted, expected in zip(r_list, a_list)
    if predicted == expected
)
correct

## Few shot prompting

In [4]:
# Read the WN18RR test split: a headerless, tab-separated text file available at
# https://github.com/villmow/datasets_knowledge_embedding/raw/master/WN18RR/text/test.txt
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)

# Turn every row into a 3-tuple built from its first three columns
test_triplets = [(row[0], row[1], row[2]) for row in wn18rr_test_set.values]

# Split the triplets into consecutive batches of 50 (the last batch may be shorter)
batch_size = 50
num_batches = -(-len(test_triplets) // batch_size)  # ceiling division
triplet_batches = [
    test_triplets[start:start + batch_size]
    for start in range(0, len(test_triplets), batch_size)
]

# Take three triplets off the end of the last batch to use as few-shot examples;
# pop() removes them from that batch, so the batch itself becomes three shorter
samples = [triplet_batches[-1].pop() for _ in range(3)]
samples

[('piciform_bird.n.01', '_hypernym', 'bird.n.01'),
 ('grocery_store.n.01', '_has_part', 'shelf.n.01'),
 ('transmit.v.04', '_derivationally_related_form', 'channelization.n.01')]

In [None]:
# Initialize lists to store responses
responses5 = []
generated_entities5 = []

# Make sure the output directory exists before appending to files inside it
os.makedirs('results', exist_ok=True)

# Loop through the test triplets and make API calls to ChatGPT
for triplet_batch in tqdm(triplet_batches):
    # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
    # The prompt lists all entities first, then all relations in the same order
    prompt = "I have relation prediction task, consider these entities:"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[0]}'
    prompt += "\nAnd also these relation respectively"
    for triplet in triplet_batch:
        prompt += f'\n{triplet[1]}'
    # The leading newline keeps the instruction from being glued onto the last relation line
    prompt += "\nFind just missing entities respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]\nNotice: 1.DO NOT GIVE ME THE RELATION OR OTHER FALSY WORDS LIKE '' IN YOUR RESPONSE!\n2. THE LENGTH OF YOUR RESPONSE SHOULD BE: "
    # Tell the model how many answers are expected for this batch
    prompt += str(len(triplet_batch))
    # Few-shot part: show the held-out sample triplets together with their answers
    prompt += "\nI give you 3 samples, consider them:\n"
    for i, s in enumerate(samples):
        prompt += f'{str(i+1)}. entity: {s[0]}, relation: {s[1]}, missing entity(answer): {s[2]}\n'
    prompt += 'DO NOT give me these samples in your response!'

    response = openai.Completion.create(
        engine="gpt-3.5-turbo-instruct",
        prompt=prompt,
        max_tokens=3100,  # Adjust the max tokens as per your requirements
        n=1,  # Generate a single response
        stop=None,  # Let ChatGPT decide when to stop the completion
        temperature=0.6,  # Adjust the temperature as per your requirements
    )

    # Extract the generated text from the API response
    generated_entity = response.choices[0].text.strip()

    responses5.append(response)
    # Store this batch's completion text (the original appended an unrelated list here)
    generated_entities5.append(generated_entity)

    # Results are appended to disk after every batch; the context manager
    # closes both handles even if a write fails
    with open('results/responses5.txt', 'a') as responses5_file, \
            open('results/generated_entities5.txt', 'a') as generated_entities5_file:
        responses5_file.write(str(response) + '\n')
        generated_entities5_file.write(str(generated_entity) + '\n')

    # Time sleep for handling ChatGPT request rate limit
    time.sleep(18)

In [5]:
results5 = []

# Read the saved completions back line by line; the context manager
# guarantees the file handle is closed afterwards
with open('results/generated_entities5.txt', 'r') as ge5_file:
    for line in ge5_file:
        start_index = line.find("[")
        # Lines without an opening bracket carry no array and are skipped
        if start_index == -1:
            continue

        end_index = line.find("]")
        # An array that is never closed on this line is treated as empty
        array_str = line[start_index:end_index + 1] if end_index != -1 else '[]'

        try:
            # Convert the string representation of the array to a Python list
            results5.append(ast.literal_eval(array_str))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed array text: record an empty list instead of failing.
            # Only literal_eval's documented errors are caught, so interrupts
            # and unrelated bugs are no longer silently swallowed.
            results5.append([])
len(results5)

63

In [6]:
# Per batch, collect the third element of every triplet
answers = [[target for _entity, _relation, target in batch] for batch in triplet_batches]

# Count the batches whose parsed array length differs from the number of answers
unbalance_count = sum(
    1
    for predicted, expected in zip(results5, answers)
    if len(predicted) != len(expected)
)
(f'count: {unbalance_count}', f'rate: {unbalance_count/len(results5)}')

('count: 48', 'rate: 0.7619047619047619')

In [8]:
# Count position-wise exact matches between parsed predictions and answers.
# The inner zip stops at the shorter list, so predictions beyond the number
# of answers in a batch are ignored.
correct = sum(
    1
    for a_list, r_list in zip(answers, results5)
    for predicted, expected in zip(r_list, a_list)
    if predicted == expected
)
correct

11