# Preparation

In [96]:
import os
import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import time
import ast

In [85]:
# Pull the settings stored in the local .env file into the process environment
load_dotenv()

# Read the key from the environment and hand it to the OpenAI client in one step
openai.api_key = api_key = os.environ.get("API_KEY")

In [86]:
# Read the WN18RR test split (tab-separated, no header row). The text file is available at
# https://github.com/villmow/datasets_knowledge_embedding/raw/master/WN18RR/text/test.txt
wn18rr_test_set = pd.read_csv("test.txt", sep="\t", header=None)

# Turn every row into a tuple built from its first three columns
test_triplets = [(row[0], row[1], row[2]) for row in wn18rr_test_set.to_numpy()]

# Slice the triplets into consecutive batches of at most `batch_size` items;
# the final batch simply holds whatever is left over
batch_size = 50
triplet_batches = [
    test_triplets[start:start + batch_size]
    for start in range(0, len(test_triplets), batch_size)
]
num_batches = len(triplet_batches)

## First prompt format

In [53]:
# Accumulators: the raw API response objects and the stripped completion text,
# one entry per batch. `response_arrays` is initialised here but not filled in this cell.
responses = []
generated_entities = []
response_arrays = []

# Make sure the output directory exists before the result files are opened
os.makedirs('results', exist_ok=True)

# The instruction is identical for every batch; only the entity/relation lines change
prompt_header = 'Consider the relation prediction task, in the following lines first the entity and second the relation are separated by space. Give me all the missing entities in a single array in your answer:'

# `with` guarantees both files are closed even if an API call raises mid-loop
with open('results/responses.txt', 'w') as file1, \
        open('results/generated_entities.txt', 'w') as file2:
    # One completion request per batch of test triplets
    for triplet_batch in tqdm(triplet_batches):
        # One line per triplet: the first element, a space, then the second element
        prompt = prompt_header
        for triplet in triplet_batch:
            prompt += f'\n{triplet[0]} {triplet[1]}'

        response = openai.Completion.create(
            engine="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=3100,  # Adjust the max tokens as per your requirements
            n=1,  # Generate a single response
            stop=None,  # Let the model decide when to stop the completion
            temperature=0.6,  # Adjust the temperature as per your requirements
        )

        # Extract the generated text of the first (only) choice
        generated_entity = response.choices[0].text.strip()

        responses.append(response)
        # Store this batch's completion text. The previous code appended the
        # `generated_entities` list to itself, so the text was never kept in memory.
        generated_entities.append(generated_entity)

        file1.write(str(response) + '\n')
        file2.write(str(generated_entity) + '\n')
        # Flush after every batch so finished results reach disk during the long run
        file1.flush()
        file2.flush()

        # Pause between requests (presumably to stay under the API rate limit)
        time.sleep(20)

  0%|          | 0/63 [00:00<?, ?it/s]

100%|██████████| 63/63 [25:00<00:00, 23.81s/it]


## Second prompt format

In [90]:
# Accumulators: the raw API response objects and the stripped completion text,
# one entry per batch
responses2 = []
generated_entities2 = []

# Make sure the output directory exists before the result files are opened
os.makedirs('results', exist_ok=True)

# Open both files once, in append mode, for the whole run; `with` guarantees they
# are closed even if an API call raises mid-loop
with open('results/responses2.txt', 'a') as responses2_file, \
        open('results/generated_entities2.txt', 'a') as generated_entities2_file:
    # One completion request per batch of test triplets
    for triplet_batch in tqdm(triplet_batches):
        # Earlier prompt variant, kept for reference:
        # prompt = "Consider the relation prediction task, in the following lines the given entity comes first and the relation comes second after a space. Missing entities should have same format as given entities. Give me all the missing entities in a single array with this format:\nmissing entities:  ['me1', 'me2', ...]"
        prompt = "I have relation prediction task, consider these entities:"
        for triplet in triplet_batch:
            prompt += f'\n{triplet[0]}'
        prompt += "\nAnd also these relation respectively"
        for triplet in triplet_batch:
            prompt += f'\n{triplet[1]}'
        # The leading newline keeps the last relation on its own line; without it the
        # instruction text was glued directly onto the final relation name.
        prompt += "\nFind just missing entities (NOT THE RELATION OR OTHER FALSY WORDS!) respectively and give them to me in a single array with this format:\nmissing entities:  ['', '', ...]"

        response = openai.Completion.create(
            engine="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=3100,  # Adjust the max tokens as per your requirements
            n=1,  # Generate a single response
            stop=None,  # Let the model decide when to stop the completion
            temperature=0.6,  # Adjust the temperature as per your requirements
        )

        # Extract the generated text of the first (only) choice
        generated_entity = response.choices[0].text.strip()

        responses2.append(response)
        # Store this batch's completion text. The previous code appended the
        # `generated_entities` list from the first-prompt cell instead.
        generated_entities2.append(generated_entity)

        responses2_file.write(str(response) + '\n')
        generated_entities2_file.write(str(generated_entity) + '\n')
        # Flush after every batch so finished results reach disk during the long run
        responses2_file.flush()
        generated_entities2_file.flush()

        # Time sleep for handling ChatGPT request rate limit
        time.sleep(19)

100%|██████████| 63/63 [24:54<00:00, 23.73s/it]


In [95]:
# Display the entries collected in `generated_entities2` by the second-prompt cell
generated_entities2

["missing entities: ['call.v.03', 'blattodea.n.01', 'rickettsiaceae.n.01', 'entoprocta.n.01', 'drill.n.01', 'breadfruit.n.01', 'rest.v.02', 'transformation.n.01', 'sympathizer.n.01', 'ration.n.02', 'transfix.v.02', 'chromosome.n.01', 'surface.n.02', 'accentuation.n.01', 'lexicology.n.01', 'sensual.s.02', 'vanish.v.02', 'sophisticated.a.01', 'ski.v.01', 'brasov.n.01', 'mayhem.n.01', 'composure.n.01', 'coelenterate_family.n.01', 'sop.n.01', 'rail_technology.n.01', 'grant.n.01', 'major_premise.n.01', 'host.v.01', 'docket.v.01', 'polish.v.02', 'kilobyte.n.02', 'aggression.n.03', 'roof.n.01', 'butcher.v.01', 'play.v.26', 'york.n.01', 'system_program.n.01', 'podcast.v.01', 'crocodylidae.n.01', 'evidence.n.02', 'issue.n.02', 'canned_food.n.01', 'crosshatch.v.01', 'acronym.n.01', 'bushbuck.n.01']",
 "missing entities: ['basketball_court.n.01', 'zoology.n.02', 'india.n.01', 'devaluation.n.02', 'astaire.n.01', 'clearing.n.01', 'criminal_negligence.n.01', 'healthy.a.01', 'warp.n.03', 'vanish.v.02

## Process result

In [101]:
def _parse_entity_list(text):
    """Return the first bracketed Python list literal found in ``text``.

    Falls back to an empty list whenever no well-formed list can be recovered,
    so one malformed completion does not abort processing of all the others.

    Parameters
    ----------
    text : str
        Completion text that is expected to contain something like
        ``missing entities: ['a', 'b', ...]``.

    Returns
    -------
    list
        The parsed list, or ``[]`` if ``text`` is not a string, has no ``[`` ... ``]``
        pair, or the bracketed span is not a valid Python list literal.
    """
    if not isinstance(text, str):
        return []

    start_index = text.find("[")
    if start_index == -1:
        return []

    # Search for the closing bracket only after the opening one, so a stray "]"
    # earlier in the text cannot produce an empty or reversed slice
    end_index = text.find("]", start_index)
    if end_index == -1:
        return []

    try:
        # literal_eval only evaluates Python literals; it never executes code
        parsed = ast.literal_eval(text[start_index:end_index + 1])
    except (ValueError, TypeError, SyntaxError):
        # Malformed model output (unquoted items, broken quoting, ...)
        return []

    return parsed if isinstance(parsed, list) else []


# One parsed list per batch, in the same order as `generated_entities2`
results = [_parse_entity_list(ge) for ge in generated_entities2]

In [103]:
len(results), len(triplet_batches)

(63, 63)