In [1]:
## This script uses GPT4 to parse sentences using VerbNet-derived semantic roles
# Run using base python 3.9
# James Fodor 2023
#
# See OpenAI instructions: https://platform.openai.com/docs/guides/gpt

# load libraries
import json
import openai
import numpy as np
import sentence_embeds_processing as sep
import seaborn as sns

# Read the project's path configuration from the JSON file next to the notebook
with open("file_paths.json", "r") as file:
    file_paths_dict = json.load(file)

# Display settings: fixed two-decimal numpy printing without scientific notation, plus seaborn's plot defaults
np.set_printoptions(
    precision=2,
    threshold=2000,
    linewidth=200,
    suppress=True,
    floatmode='fixed',
)
sns.set()

# Configure the openai module with the credentials returned by sep.load_openai_key
# (API calls will not work without a valid key)
import openai
api_key, api_org = sep.load_openai_key('openai_key.txt')
openai.api_key = api_key
openai.organization = api_org

In [2]:
## Function for parsing sentence pair using GPT4 API
def gpt4_parse(instruction_base, example_sentence, example_output, final_instruction, sentences):
    """Ask GPT-4 to label the main verb and semantic roles of the given text.

    The request is a one-shot conversation: the task description, a worked
    example (input sentence and the expected answer), and finally the text
    that should be parsed.

    Args:
        instruction_base: task description, sent as the first user message.
        example_sentence: example input, sent as the second user message.
        example_output: answer for the example, sent as an assistant message.
        final_instruction: lead-in joined to `sentences` with a single space.
        sentences: string containing the sentence(s) to parse.

    Returns:
        The message content of the first choice in the API response.
    """
    closing_prompt = final_instruction + ' ' + sentences
    conversation = [
        {"role": "user", "content": instruction_base},
        {"role": "user", "content": example_sentence},
        {"role": "assistant", "content": example_output},
        {"role": "user", "content": closing_prompt},
    ]
    # temperature=0 is passed so that sampling is as deterministic as the API allows
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation,
        temperature=0,
    )
    first_choice = response["choices"][0]
    return first_choice["message"]["content"]

In [3]:
## List the datasets registered in the sep module, one "key path" line each
pairs = False # True when working with sentence pairs, False for a plain list of sentences
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets
for dataset in datasets:
    print(dataset, datasets[dataset])

0 2014 Wehbe\Stimuli\Chapter_9_sentences_final
1 2017 Anderson\Stimuli\stimuli_final
2 2018 Pereira\Stimuli\stimuli_243sentences
3 2018 Pereira\Stimuli\stimuli_384sentences
4 2020 Alice Dataset\Stimuli\stimuli_sentences_final
5 2020 Zhang\Stimuli\test_sentences_final
6 2023 Fodor Dataset\Fodor2023-final240
7 2023 Fodor Dataset\Fodor2023-final192
8 2023 Fodor Dataset\Fodor2023-prelim


In [4]:
## Load one sentence set (pick its key from the listing printed by the previous cell)
dataset = datasets[5]
sentences_dict = sep.load_set_of_sentences(
    dataset,
    file_paths_dict['data_pairs_path'],
    file_paths_dict['data_nonpaired_path'],
    pairs,
)
full_dataset_name = sep.fix_sentence_dataset_name(dataset)
n = len(sentences_dict) # number of entries loaded
print('\nloaded', dataset, 'with', n, 'sentences')


loaded 2020 Zhang\Stimuli\test_sentences_final with 95 sentences


In [5]:
## GPT-4 prompt components used for parsing
# Each instruction is one line of text. It is assembled from adjacent string literals
# (rather than backslash continuations) so the pieces can be indented safely.

# Task description for parsing a pair of sentences in one request
GPT_instruction_base = (
    'Two sentences are given below. First, identify the main verb in each sentence. '
    'Each sentence should only have a single main verb. Use simple present conjugation. '
    'Second, label the semantic roles in each of these new sentences. '
    'Use the roles: "Agent", "Patient", "Theme", "Time", "Manner", "Location" ,"Trajectory". '
    'Print all results in a single list on one line. '
    'Print each role regardless of whether it is found in the sentence. Do not explain your answers. '
    'Here is one example of what to print:'
)

# Task description for parsing a single sentence
GPT_instruction_single = (
    'One sentence is given below. First, identify the main verb in the sentence. '
    'The sentence should only have a single main verb. Use simple present conjugation. '
    'Second, label the semantic roles in the sentence. '
    'Use the roles: "Agent", "Patient", "Theme", "Time", "Manner", "Location" ,"Trajectory". '
    'Print the results on one line. '
    'Print each role regardless of whether it is found in the sentence. Do not explain your answers. '
    'Here is one example of what to print:'
)

# One-shot example shown to the model: an input sentence and the answer expected for it
example_sentence = 'Food is what people and animals reluctantly eat on Thursdays'
# The answer must be valid JSON because the model imitates its format. The "Theme" entry
# previously used a comma instead of a colon between key and value, which made it unparseable.
example_output = (
    '{"Verb": "is", "Agent": "food", "Patient": "NONE", '
    '"Theme": "what people and animals eat", "Time": "on Thursdays", '
    '"Manner": "reluctantly", "Location": "NONE", "Trajectory": "NONE"}'
)

# Lead-in placed directly before the text to parse (pair and single-sentence variants)
final_instruction = 'Here are the two sentences for you to parse:'
final_instruction_single = 'Here is the sentence for you to parse:'

In [6]:
## Parse every sentence (or sentence pair) in the loaded set and print the results
# The printed lines are intended to be saved to a .json file afterwards. Each result is printed
# as soon as it arrives so that completed parses are still visible if a later API call fails.
start_int = 0 # index of the first entry to process; the printed keys are offset to match
# The loop index is deliberately not called `int`: that would shadow the builtin for the rest of the notebook.
for sent_idx, sent_pair in enumerate(list(sentences_dict.values())[start_int:], start=start_int):

    if pairs: # parsing a sentence pair
        # string form of the first two elements with the enclosing brackets stripped
        pair_str = str(sent_pair[0:2])[1:-1]
        parsed_pair = gpt4_parse(GPT_instruction_base, example_sentence, example_output, final_instruction, pair_str)
    else: # parsing one sentence at a time
        single_sent = sent_pair
        parsed_pair = gpt4_parse(GPT_instruction_single, example_sentence, example_output, final_instruction_single, single_sent)

    parsed_pair = parsed_pair.replace('\n\n', ', ') # remove errant new-lines
    print('"'+str(sent_idx)+'": ['+parsed_pair+'],') # print results in json form

"0": [{"Verb": "started", "Agent": "I", "Patient": "11th and 12th grade English", "Theme": "teaching", "Time": "In January of 2008", "Manner": "NONE", "Location": "at a public school here in the city", "Trajectory": "NONE"}],
"1": [{"Verb": "was", "Agent": "I", "Patient": "22", "Theme": "NONE", "Time": "when I started", "Manner": "NONE", "Location": "NONE", "Trajectory": "NONE"}],
"2": [{"Verb": "were", "Agent": "The kids", "Patient": "NONE", "Theme": "seventeen", "Time": "NONE", "Manner": "NONE", "Location": "NONE", "Trajectory": "NONE"}],
"3": [{"Verb": "looked", "Agent": "They", "Patient": "NONE", "Theme": "like they were 35", "Time": "NONE", "Manner": "NONE", "Location": "NONE", "Trajectory": "NONE"}],
"4": [{"Verb": "believe", "Agent": "They", "Patient": "I", "Theme": "their English teacher", "Time": "NONE", "Manner": "not", "Location": "NONE", "Trajectory": "NONE"}],
"5": [{"Verb": "liked", "Agent": "they", "Patient": "my name", "Theme": "that my name was Mr Manley", "Time": "NON

In [None]:
## Parse one hand-written sentence; the returned text is displayed as the cell's output
sentence = 'Banks foolishly rely on electronic fund transfer systems.'
gpt4_parse(
    GPT_instruction_single,
    example_sentence,
    example_output,
    final_instruction_single,
    sentence,
)