In [4]:
## This script uses GPT4 to give similarity ratings to sentence pairs
# Run using base python 3.9
# James Fodor 2023
#
# See OpenAI instructions: https://platform.openai.com/docs/guides/gpt

# load libraries
import itertools
import json
import os

import numpy as np

import sentence_embeds_processing as sep

# Read the project's path configuration from disk
with open("file_paths.json", "r") as paths_file:
    file_paths_dict = json.load(paths_file)

# Configure how numpy arrays are printed in this notebook
np.set_printoptions(
    precision=2,
    threshold=2000,
    linewidth=200,
    suppress=True,
    floatmode='fixed',
)

# Build the OpenAI client used for the chat-completion requests below (needs a key to work)
from openai import OpenAI
api_key, api_org = sep.load_openai_key('openai_key.txt')
# NOTE(review): api_org is loaded but never handed to the client -- confirm that is intended
open_ai_client = OpenAI(api_key=api_key)

In [5]:
## Show available datasets, as specified in the sentence_embeds_processing module
pairs = False # specify if we are using paired data or list of sentences
# Pick the dataset catalogue that matches the chosen data type
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets

# Print each selectable key next to the dataset name it maps to
for dataset_idx, dataset_name in datasets.items():
    print(dataset_idx, dataset_name)

0 Wehbe_neuro
1 Anderson_neuro
2 Pereira243_neuro
3 Pereira384_neuro
4 Alice_neuro
5 Zhang_neuro
6 Fodor2023-final192_neuro
7 Fodor2023-final108_neuro


In [8]:
## Load sentence set (choose number from those printed above)
dataset = datasets[7]
# Build the stimulus path with os.path.join so the separator matches the host OS
# (the previous hard-coded '\\' separators only resolved on Windows).
sentences_path = os.path.join(file_paths_dict['data_nonpaired_path'],
                              '2023 Fodor Dataset', '1 - Stimuli', 'Fodor2023-final108.txt')
# NOTE(review): the file name is fixed while `dataset` is chosen by index -- keep the two in sync
sentences_dict = sep.load_set_of_sentences(dataset, sentences_path, pairs)
full_dataset_name = sep.fix_sentence_dataset_name(dataset, pairs)
n = len(sentences_dict) # num sentences
print('\nloaded',full_dataset_name,'with',n,'sentences')


loaded Fodor2023-final108_neuro with 108 sentences


In [9]:
## Function for getting sentence-pair ratings from GPT4
def gpt4_ratings(instruction, sentence_pairs, model="gpt-4", temperature=0, client=None):
    """Send one rating request to a chat-completion model and return its reply text.

    Parameters
    ----------
    instruction : str
        Task description, sent as the first user message.
    sentence_pairs : str
        The sentences to be rated, sent as the second user message.
    model : str, optional
        Model name passed to the API. Defaults to "gpt-4".
    temperature : float, optional
        Sampling temperature passed to the API. Defaults to 0.
    client : optional
        Object exposing ``chat.completions.create``. When omitted, the
        notebook-level ``open_ai_client`` is used.

    Returns
    -------
    The ``content`` of the first choice's message, exactly as returned by the
    API (no parsing or numeric conversion is applied here).
    """
    # Fall back to the notebook-level client only when none is supplied, so the
    # function can be exercised with a stand-in client.
    if client is None:
        client = open_ai_client
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": instruction},
                  {"role": "user", "content": sentence_pairs}],
        temperature=temperature)
    return response.choices[0].message.content

In [10]:
# Prompt text passed to gpt4_ratings with every sentence pair. The adjacent literals
# below are joined by Python into one single-line string (no newlines are inserted).
instruction = (
    'You will be presented with two sentences. '
    'Your task is to judge how similar is the meaning of the two sentences. '
    'You will make this judgement by choosing a rating from 0 (most dissimilar) to 1 (most similar) to two decimal places. '
    'In providing your rating, consider both the similarity in meaning of the individual words contained in the sentences, '
    'as well as the similarity of the overall idea or meaning expressed by the sentences. '
    'Provide a numerical rating only; do not explain your answers. '
    'Here are the sentences:'
)

In [11]:
# Enumerate every unordered pair of sentence ids, then look up the sentence stored under each id
sent_id_pairs = list(itertools.combinations(sentences_dict.keys(), 2))
sent_pairs = [[sentences_dict[first_id], sentences_dict[second_id]]
              for first_id, second_id in sent_id_pairs]
print(len(sent_pairs),'pairs')

5778 pairs


In [12]:
# Iterate through the list of sentence pairs, requesting one GPT-4 rating per pair.
# start_idx names the resume point that the old `[0:]` slice left implicit: keep it at 0 for a
# full run, or set it to the number of ratings already collected to continue an interrupted run.
start_idx = 0
if start_idx == 0:
    gpt4_ratings_storage = []  # fresh run: start with empty storage
elif len(gpt4_ratings_storage) != start_idx:
    # Resuming from the wrong offset would misalign the stored ratings with sent_pairs
    raise ValueError('start_idx does not match the number of ratings already stored')
for idx, sent_pair in enumerate(sent_pairs[start_idx:], start=start_idx):
    # Both sentences go into a single message, separated by a newline
    gpt4_output = gpt4_ratings(instruction, '\n'.join(sent_pair))
    gpt4_ratings_storage.append(gpt4_output)
    if (idx%100)==0:
        print(idx)  # progress marker every 100 pairs

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700


In [13]:
# Write the collected GPT-4 responses to a text file whose name starts with the dataset name
ratings_file = full_dataset_name+"_GPT4_rated_similarities3.txt"
np.savetxt(ratings_file, gpt4_ratings_storage, fmt='%s')