In [25]:
## This script uses GPT4 to parse sentences using VerbNet-derived semantic roles
# Run using base python 3.9
# James Fodor 2023
#
# See OpenAI instructions: https://platform.openai.com/docs/guides/gpt

# load libraries
import json
from openai import OpenAI
import numpy as np
import sentence_embeds_processing as sep
import seaborn as sns

# Read the JSON file of project paths; later cells index into file_paths_dict by key
with open("file_paths.json", "r") as file:
    file_paths_dict = json.load(file)

# Display settings: fixed two-decimal numpy printing without scientific notation,
# plus seaborn's default plotting theme
np.set_printoptions(
    precision=2,
    threshold=2000,
    linewidth=200,
    suppress=True,
    floatmode='fixed',
)
sns.set()

# Build the OpenAI client used by the parsing cells.
# The key is read from a local text file through the sep helper (needs a key to work).
from openai import OpenAI  # already imported at the top of this cell; kept as-is
api_key = sep.load_openai_key('openai_key.txt')
open_ai_client = OpenAI(api_key=api_key)

In [29]:
## Function for parsing sentence pair using GPT4 API
def gpt4_parse(instruction_base, example_sentence, example_output, final_instruction, sentences,
               model="gpt-4o", temperature=0):
    """Send a one-shot semantic-role parsing prompt to an OpenAI chat model.

    The conversation is: the task instruction, one worked example (the example
    sentence as a user turn and the example parse as an assistant turn), then
    the final instruction followed by the sentence(s) to parse.

    Parameters
    ----------
    instruction_base : str
        Task description sent as the first user message.
    example_sentence : str
        Example input sentence shown to the model.
    example_output : str
        Example parse, sent as an assistant message.
    final_instruction : str
        Lead-in text placed (with a single space) before `sentences`.
    sentences : str
        The sentence or sentence pair to parse, already formatted as text.
    model : str, optional
        Chat model name passed to the API. Defaults to "gpt-4o".
    temperature : float, optional
        Sampling temperature passed to the API. Defaults to 0.

    Returns
    -------
    The `content` of the first choice's message in the API response.

    Notes
    -----
    Uses the module-level `open_ai_client`; errors raised by the client are
    not caught here.
    """
    messages = [
        {"role": "user", "content": instruction_base},
        {"role": "user", "content": example_sentence},
        {"role": "assistant", "content": example_output},
        {"role": "user", "content": final_instruction+' '+sentences},
    ]
    response = open_ai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature)
    return response.choices[0].message.content

In [13]:
## List the datasets that the sep module exposes for the chosen data type
pairs = True # True: paired-sentence data; False: plain list of sentences
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets
print('Available datasets:')
# one line per dataset: its key followed by its name
for dataset_key, dataset_label in datasets.items():
    print(dataset_key, dataset_label)

Available datasets:
0 GS2011_processed
1 KS2013_processed
2 Fodor_pilot_2022
3 STS131_processed
4 SICK_relatedness
5 STR_processed
6 STSb_captions_test
7 STSb_forums_test
8 STSb_headlines_test
9 STSb_test
10 STS3k_all


In [14]:
## Load sentence set

# pick a dataset by its key in the listing printed above
dataset_name = datasets[10]

# paired and non-paired sets are read from different entries of file_paths_dict
data_root_key = 'data_pairs_path' if pairs else 'neuro_root'
sentences_dict = sep.load_set_of_sentences(dataset_name, file_paths_dict[data_root_key], pairs)
n = len(sentences_dict) # number of entries loaded
print('\nloaded',dataset_name,'with',n,'sentences')

# collect the sentences into a list of lists
if pairs:
    # sentence similarity pair data: one list for column 0 and one for column 1
    pair_array = np.array(list(sentences_dict.values()))
    sentences = [list(pair_array[:, column].flatten()) for column in (0, 1)]
else:
    # neuroimaging data / plain list of sentences: a single inner list
    sentences = [list(sentences_dict.values())]


loaded STS3k_all with 2800 sentences


In [15]:
## GPT-4 instruction used for parsing
# Task description for the paired case (two sentences per request)
GPT_instruction_base = 'Two sentences are given below. First, identify the main verb in each sentence. Each sentence should only have a single main verb. Use simple present conjugation. \
Second, label the semantic roles in each of these new sentences. Use the roles: \"Agent\", \"Patient\", \"Theme\", \"Time\", \"Manner\", \"Location\" ,\"Trajectory\". \
Print all results in a single list on one line. Print each role regardless of whether it is found in the sentence. Do not explain your answers. \
Here is one example of what to print:'
# Task description for the single-sentence case
GPT_instruction_single = 'One sentence is given below. First, identify the main verb in the sentence. The sentence should only have a single main verb. Use simple present conjugation. \
Second, label the semantic roles in the sentence. Use the roles: \"Agent\", \"Patient\", \"Theme\", \"Time\", \"Manner\", \"Location\" ,\"Trajectory\". \
Print the results on one line. Print each role regardless of whether it is found in the sentence. Do not explain your answers. \
Here is one example of what to print:'
# One-shot example: the input sentence and the parse the model should imitate
example_sentence = 'Food is what people and animals reluctantly eat on Thursdays'
# The example parse must itself be valid JSON: every key is followed by a colon.
# (The "Theme" entry previously used a comma, which made the example malformed.)
example_output = '{"Verb": "is", "Agent": "food", "Patient": "NONE", "Theme": "what people and animals eat", "Time": "on Thursdays", "Manner": "reluctantly", "Location": "NONE", "Trajectory": "NONE"}'
# Lead-in text placed directly before the sentence(s) to parse
final_instruction = 'Here are the two sentences for you to parse:'
final_instruction_single = 'Here is the sentence for you to parse:'

In [32]:
## Parse a list of sentences and print results
# These should be saved to a .json file, but because the api can break I've found it better to print each sentence to output first.
start_int = 45 # index of the first entry to parse
end_int = 381 # index one past the last entry to parse
entries_to_parse = list(sentences_dict.values())[start_int:end_int]

# enumerate from start_int so the printed key is the entry's position in the full set;
# the loop variable is deliberately not called `int`, which would shadow the builtin
for entry_index, sent_pair in enumerate(entries_to_parse, start=start_int):

    if pairs: # if we're parsing a sentence pair
        # text form of the first two items of the entry, with the outer brackets stripped
        pair_str = str(sent_pair[0:2])[1:-1]
        parsed_pair = gpt4_parse(GPT_instruction_base, example_sentence, example_output, final_instruction, pair_str)
    else: # if we're parsing one sentence at a time
        single_sent = sent_pair
        parsed_pair = gpt4_parse(GPT_instruction_single, example_sentence, example_output, final_instruction_single, single_sent)

    parsed_pair = parsed_pair.replace('\n\n', ', ') # remove errant new-lines
    print('"'+str(entry_index)+'": ['+parsed_pair+'],') # print results on json form

"45": [{"Verb": "feed", "Agent": "termites", "Patient": "NONE", "Theme": "wood", "Time": "NONE", "Manner": "mainly", "Location": "NONE", "Trajectory": "NONE"} {"Verb": "lives", "Agent": "the young family", "Patient": "NONE", "Theme": "NONE", "Time": "NONE", "Manner": "NONE", "Location": "in a large house", "Trajectory": "NONE"}],
"46": [{"Verb": "make", "Agent": "stimulants", "Patient": "sleep", "Theme": "NONE", "Time": "NONE", "Manner": "difficult", "Location": "NONE", "Trajectory": "NONE"} {"Verb": "runs", "Agent": "the street", "Patient": "NONE", "Theme": "NONE", "Time": "NONE", "Manner": "NONE", "Location": "between rows of terraced houses", "Trajectory": "NONE"}],
"47": [[{"Verb": "begets", "Agent": "violence", "Patient": "further violence", "Theme": "NONE", "Time": "NONE", "Manner": "NONE", "Location": "NONE", "Trajectory": "NONE"}, {"Verb": "cost", "Agent": "the repairs", "Patient": "quite a lot of money", "Theme": "NONE", "Time": "NONE", "Manner": "NONE", "Location": "NONE", "T

In [None]:
## Parse one hand-picked sentence with the single-sentence prompt
sentence = 'Banks foolishly rely on electronic fund transfer systems.'
# the call is the cell's last expression, so its return value is shown as the cell output
gpt4_parse(
    instruction_base=GPT_instruction_single,
    example_sentence=example_sentence,
    example_output=example_output,
    final_instruction=final_instruction_single,
    sentences=sentence,
)