In [48]:
import os
import json
import pandas as pd
import weat
from openai import OpenAI

In [4]:
# Hand the key to the client instance explicitly. Assigning `OpenAI.api_key` on
# the class does not configure a client object; the instance is what
# `client.embeddings.create` uses. A missing environment variable still fails
# fast here with KeyError, as before.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Name of the embedding model; `words_to_embeddings` selects its backend by
# checking for the substrings 'embedding' / 'llama' in this string.
model = "text-embedding-3-small"

In [5]:
def words_to_embeddings(words, version=0, word_type='target'):
    """Embed each word in ``words`` and return the vectors in the same order.

    Reads the module-level ``model`` (backend selection) and ``client``
    (OpenAI client) defined in the configuration cell.

    Args:
        words: list of words like ['marvelous', 'superb'].
        version: 0 embeds just the word itself; 1 embeds the word appended to
            a fixed prompt fragment, similar to CEAT.
        word_type: 'target' or 'attribute'; selects the prompt fragment and is
            only consulted when ``version == 1``.

    Returns:
        A list with one embedding per input word (empty for empty input).

    Raises:
        ValueError: if ``version``, ``word_type`` (for version 1) or the
            module-level ``model`` is not one of the supported values.
            Previously such a call either crashed on an unbound local or
            silently re-used the text/embedding of the previous word.
    """
    llama = None  # created lazily, once, instead of once per word
    embeddings = []
    for word in words:
        # Build the text that is actually sent to the embedding backend.
        if version == 0:
            to_embed = word
        elif version == 1:
            if word_type == 'target':
                to_embed = "Here is a list of words. For each word pick a word — " + word # our prompt
            elif word_type == 'attribute':
                to_embed = " — and write it after the word. The words are " + word  # our prompt
            else:
                raise ValueError(
                    "word_type must be 'target' or 'attribute', got %r" % (word_type,)
                )
        else:
            raise ValueError("version must be 0 or 1, got %r" % (version,))

        if 'embedding' in model:
            # One request per word; the single input yields a single result.
            embedding = client.embeddings.create(input = [to_embed], model=model).data[0].embedding
        elif 'llama' in model:
            # https://medium.com/@liusimao8/using-llama-2-models-for-text-embedding-with-langchain-79183350593d
            # https://python.langchain.com/docs/integrations/text_embedding/llamacpp
            # NOTE(review): `LlamaCppEmbeddings` is not imported in the visible
            # import cell and the model path is a placeholder; this branch
            # needs both fixed before it can run.
            if llama is None:
                llama = LlamaCppEmbeddings(model_path="/path/to/model/model.bin")
            embedding = llama.embed_query(to_embed)
        else:
            raise ValueError("unsupported embedding model: %r" % (model,))

        embeddings.append(embedding)
    return embeddings

In [42]:
# Load the stimuli table and the chained results; no rows are dropped here, so
# `attributes` / `targets` below stay aligned one-to-one with `chained_df` rows.
stimuli_df = pd.read_csv('iat_stimuli.csv')
chained_df = pd.read_csv('result_chained.csv', index_col=0)
iat_texts = chained_df['iat']

# obtain stimuli from each gpt4 prompt
attributes = []
targets = []
for txt in iat_texts:
    words_before_hyphen = []
    words_after_hyphen = []

    # str() turns a missing response (NaN) into 'nan', which contains no hyphen
    # and therefore yields two empty lists for that row.
    for line in str(txt).strip().split('\n'):
        # Drop surrounding whitespace and any leading bullet dashes.
        cleaned_line = line.strip().lstrip('-').strip()
        if '-' not in cleaned_line:
            continue
        # Split at the first hyphen only.
        # NOTE(review): a hyphenated word before the separator (e.g. "well-being")
        # would be cut at its own hyphen — confirm the responses never contain one.
        before, after = cleaned_line.split('-', 1)
        words_before_hyphen.append(before.strip())
        words_after_hyphen.append(after.strip())

    attributes.append(words_before_hyphen) # a list of attribute words
    # De-duplicate in first-seen order (expected: only 2 target words).
    # list(set(...)) gave an order that can change between runs.
    targets.append(list(dict.fromkeys(words_after_hyphen)))

# map random-order stimuli back to default, stigma, pos, neg.
# For every dataset: column 'A' feeds X, column 'B' feeds Y, and column 'C' is
# cut in the middle, its first half going to A and the rest to B. All values
# are lower-cased and missing entries are skipped.
X, Y, A, B = [], [], [], []
for d in stimuli_df['dataset'].unique().tolist():
    dataset_rows = stimuli_df[stimuli_df['dataset'] == d]
    X.append(dataset_rows['A'].dropna().str.lower().tolist())
    Y.append(dataset_rows['B'].dropna().str.lower().tolist())
    C = dataset_rows['C'].dropna().str.lower().tolist()
    midpoint = len(C) // 2
    A.append(C[:midpoint])
    B.append(C[midpoint:])

def flatten_and_deduplicate(input_list):
    """Flatten one level of nesting and remove duplicate elements.

    Args:
        input_list: iterable whose items are either lists (their elements are
            taken individually) or single hashable values (taken as-is).

    Returns:
        A list of the unique elements in first-seen order. The order is now
        deterministic; it previously came from ``list(set(...))`` and could
        differ from one run to the next.
    """
    flattened_list = []
    for element in input_list:
        if isinstance(element, list):
            flattened_list.extend(element)
        else:
            flattened_list.append(element)
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(flattened_list))

# Pool each stimulus group across all datasets into a single de-duplicated list.
original_X, original_Y, original_A, original_B = (
    flatten_and_deduplicate(per_dataset) for per_dataset in (X, Y, A, B)
)

In [None]:
# (Removed) This never-executed cell was a truncated copy of the
# "sentence embedding" cell that follows: it classified the parsed words and
# appended placeholder vectors for skipped prompts, but never called
# `words_to_embeddings`, and every list it built is re-initialised by the next
# cell. The complete loop lives in that cell; this one can be deleted.

In [43]:
# sentence embedding
version = 1

# Shape of the placeholder written for prompts that are skipped below.
# NOTE(review): 1536 was hard-coded in the original cell — confirm it matches
# the vector length returned by the configured `model`.
EMBEDDING_DIM = 1536
PLACEHOLDER_VALUE = 1e2

# Lower-cased lookup sets, built once instead of once per parsed word.
known_X = {x.lower() for x in original_X}
known_Y = {y.lower() for y in original_Y}
known_A = {a.lower() for a in original_A}
known_B = {b.lower() for b in original_B}

# One entry per prompt, aligned with `targets` / `attributes`.
emb_X_list = []
emb_Y_list = []
emb_A_list = []
emb_B_list = []

for target, attribute in zip(targets, attributes):

    target_X = []
    target_Y = []
    attribute_A = []
    attribute_B = []

    # Sort the parsed target words into the X / Y groups; words that match
    # neither group are ignored. Some strings have upper-case letters, so the
    # comparison is case-insensitive.
    for t in target:
        t_key = t.lower()
        if t_key in known_X:
            target_X.append(t)
        elif t_key in known_Y:
            target_Y.append(t)

    # Same for attributes. The stimuli lists are lower-cased, so match
    # case-insensitively here too; the previous exact match silently dropped
    # any attribute containing an upper-case letter.
    for a in attribute:
        a_key = a.lower()
        if a_key in known_A:
            attribute_A.append(a)
        elif a_key in known_B:
            attribute_B.append(a)

    # Without a word in both target groups the prompt is skipped: store
    # constant placeholder vectors so the output stays aligned with the input.
    if not target_X or not target_Y:
        emb_X_list.append([[PLACEHOLDER_VALUE] * EMBEDDING_DIM])
        emb_Y_list.append([[PLACEHOLDER_VALUE] * EMBEDDING_DIM])
        emb_A_list.append([[PLACEHOLDER_VALUE] * EMBEDDING_DIM for _ in attribute_A])
        emb_B_list.append([[PLACEHOLDER_VALUE] * EMBEDDING_DIM for _ in attribute_B])
        continue

    emb_X = words_to_embeddings(target_X, word_type='target', version=version)
    emb_Y = words_to_embeddings(target_Y, word_type='target', version=version)
    emb_A = words_to_embeddings(attribute_A, word_type='attribute', version=version)
    emb_B = words_to_embeddings(attribute_B, word_type='attribute', version=version)

    emb_X_list.append(emb_X)
    emb_Y_list.append(emb_Y)
    emb_A_list.append(emb_A)
    emb_B_list.append(emb_B)

In [49]:
# Write the four per-prompt embedding collections to disk as one JSON object,
# keyed by the name of the list each one came from.
data = dict(
    emb_X_list=emb_X_list,
    emb_Y_list=emb_Y_list,
    emb_A_list=emb_A_list,
    emb_B_list=emb_B_list,
)
with open('result_embed.json', 'w') as file:
    json.dump(data, file, indent=4)