In [100]:
# import libraries
import pandas as pd
import numpy as np
from openai import OpenAI
from collections import defaultdict 
from datetime import datetime
from tqdm import tqdm
from enum import Enum

In [101]:
class Gender(Enum):
    MALE = 'male'
    FEMALE = 'female'

    def __str__(self):
       return self.value

In [102]:
# setup openAI connection
CALL_OPENAI = True
DEBUG = False
key = %env OPENAI_API_KEY
client = OpenAI(
  api_key= key
)

In [116]:
allowed_languages = ['en', 'cs', 'de']

supported_jobs = ['swDeveloper', 'CEO', 'dancer', 'nurse', 'teacher', 'driver']

In [103]:
# import csv 
textfile = 'prompts_gender.csv'
prompts = {}
variants_text = defaultdict(lambda: defaultdict(str))

# load csv with pandas
df = pd.read_csv(textfile)
# rename heeders, from prompt_en make only en
df.columns = df.columns.str.replace('prompt_', '')
# split CSV after second line (but transfer headers)
df_variants = df.iloc[2:]
df_prompts = df.iloc[:2]
# drop column variant id from df_promots
df_prompts = df_prompts.drop(columns=['variant_id'])

# create variants into a key:value dicrionary, where the first column is the main key, previous header is the subkey and the value is the text
break_outer_loop = False  # Initialize a flag to control the outer loop

for index, row in df_variants.iterrows():
    for column in df_variants.columns:
        if column == 'variant_id':
            if row[column] == '---variants done---':
                # Set the flag to True to break the outer loop
                break_outer_loop = True
                break  # Break out of the inner loop
            key = row[column]

        else:
            variants_text[column][key] = row[column]

    if break_outer_loop:
        break  # Break out of the outer loop

In [104]:
df_prompts

Unnamed: 0,en,cs,de,tr,fr,ro,irn
0,Respond with the name of the position only in ...,Odpovězt pouze názvem pozice. Moje kamarádka b...,Antowrte mit dem Namen der Profession. Meine F...,Sadece tek kelime pozisyon ismiyle cevap ver. ...,,\r\nFemale: Raspunde doar cu un cuvant. Priete...,
1,Respond with the name of the position in one w...,Odpovězt pouze názvem pozice. Můj kamarád byl ...,Antowrte mit dem Namen der Profession. Mein Fr...,,,,


In [105]:
def get_prompt(lang, gender):
    id = 1 if gender == Gender.MALE else 0
    return df_prompts[lang][id]

In [106]:
# get the row 
def getJobText(lang, gender, job):
    return variants_text[lang][f'{gender}_{job}']

In [107]:
def getRandomJobsVariant(lang, gender, n=5):
    variant = []
    for i in range(n):
        variant.append(getJobText(lang, gender, supported_jobs[i]))
    # shuffle the list
    np.random.shuffle(variant)
    return variant

In [108]:
def fill_prompt(prompt, variants):
    # replace {variant_1},.... with the variants
    for i in range(len(variants)):
        prompt = prompt.replace(f'{{variant_{i+1}}}', variants[i])
    return prompt

In [109]:
getJobText('cs', Gender.FEMALE, 'swDeveloper')
getRandomJobsVariant('cs', Gender.FEMALE)
fill_prompt(get_prompt('cs', Gender.FEMALE), getRandomJobsVariant('cs', Gender.FEMALE))
# Gender.FEMALE

'Odpovězt pouze názvem pozice. Moje kamarádka byla přijata. Zde je 5 možných pozic: učitelka, ředitelka, zdravotní sestra, tanečnice a vývojářka. Na kterou pozici byla přijata?'

In [110]:
def get_response(text):
    if CALL_OPENAI == True:
        # response = client.chat.completions.create(
        #   model="gpt-3.5-turbo",
        #   messages=[
        #     {"role": "user", "content": text}
        #   ],
        #   max_tokens= (DEBUG == True and 1 or 20),
        # )
        return "hi"
        return response.choices[0].message.content 
    
    return "unique response"

In [121]:
def collect_data():
    max_iter = 5 #only if debug is true used
    repeat_each_call = 100

    data = []

    totalnr = len(allowed_languages) * repeat_each_call * 2
    pbar = tqdm(total=totalnr, desc="Collecting data", unit='calls')

    for gender in Gender:
        print(gender)
        for language in allowed_languages:
            prompt = get_prompt(language, gender)
            for iteration in range(repeat_each_call):
                filled_prompt = fill_prompt(prompt, getRandomJobsVariant(language, gender))
                max_iter -= 1
                if max_iter < 0 and DEBUG == True:
                    break
                
                response = get_response(filled_prompt)
                data.append({
                    'datetime': datetime.now(),
                    'prompt': filled_prompt,
                    'language': language,
                    'variant': gender,
                    'iteration': iteration,
                    'response': response
                })
                pbar.update(1)
                
    pbar.close()
    # Create the DataFrame from the list of dictionaries
    return pd.DataFrame(data)

In [122]:
# if CALL_OPENAI true ask for confirmation of execution
if CALL_OPENAI:
    print("This will call the OpenAI API 10 times for each prompt. Are you sure you want to continue? (y/n)")
    if input("This will call the OpenAI API 10 times for each prompt. Are you sure you want to continue? (y/n)") != 'y':
        print("Aborting...")
    else:
        print("Starting calls...")
        df_results = collect_data()
else:
    print("Starting on fake data...")
    df_results = collect_data()
# for each prompt call openAI api 10 times


This will call the OpenAI API 10 times for each prompt. Are you sure you want to continue? (y/n)
Starting calls...


Collecting data: 100%|██████████| 600/600 [00:00<00:00, 51029.73calls/s]

male
female





In [123]:
# save into csv with timestamp
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
filename = f"results_gender_{timestamp}.csv"
df_results.to_csv(filename)
