In [144]:
import pandas as pd

# Load the pre-filtered question table; the first CSV column becomes the index.
filtered_qs_path = '../outputs/filtered_qs.csv'
filtered_qs = pd.read_csv(filtered_qs_path, index_col=0)

In [145]:
# Sanity-check the prompt payload on the first question only: the question text
# followed by whichever answer options are present (missing options are dropped).
preview_cols = ['text', 'option_1', 'option_2', 'option_3', 'option_4']
_, first_row = next(filtered_qs.iterrows())
foo = first_row[preview_cols].dropna().to_list()

foo

['Which would you rather be?', 'Normal', 'Weird']

In [146]:
import json 

# Write one chat-completion request per question to a JSONL file, in the
# line-per-request layout used for batch uploads. Each request is tagged with
# "request-<index label>" so a result can be traced back to its question.
output_file = 'batchinput.jsonl'

# Instructions shared by every request (identical text for all questions).
system_prompt = (
    "You are a data labeler. "
    "The data comes from the OkCupid dataset from Kirkegaard et. al. "
    "Questions and their multiple choice answers are provided. "
    "Your job is to rate the questions on a scale from 1-5 in terms of how useful you think the question is for identifying the user's demographic. "
    "For example, beliefs about sex before marriage, cheating on a test may say a lot about a user's cultural background. "
    "But their preference for wine vs. beer may just be more random. "
    "But remember: users answer questions optionally. "
    "So many of the questions are sexual in nature; in these cases, don't assume that they hold valuable information about the user's views on sexuality but that more conservative users probabily skipped them and the questions aren't important. "
    "\n\n Begin your response with the score so that our python script can scrape it from the beginning."
)

# Question text plus up to four answer options; absent options are dropped per row.
question_cols = ['text', 'option_1', 'option_2', 'option_3', 'option_4']

with open(output_file, 'w') as jsonl_out:
    for question_id, question in filtered_qs.iterrows():
        # The user message is the string form of the Python list [text, option, ...].
        user_content = str(question[question_cols].dropna().to_list())
        request_body = {
            "model": "gpt-4.1-mini",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            "max_tokens": 1000,
        }
        request = {
            "custom_id": f"request-{question_id}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": request_body,
        }
        jsonl_out.write(json.dumps(request) + '\n')


In [147]:
from openai import OpenAI
import os

# Read the API key from a file in the user's home directory so the secret
# never appears in the notebook itself.
with open(os.path.expanduser("~/openai_apikey"), "r") as key_file:
    api_key = key_file.read().strip()

client = OpenAI(api_key=api_key)

# Upload the request file for batch processing. The handle is opened in a
# `with` block so it is closed once the upload returns, instead of being left
# open for the lifetime of the kernel.
with open("batchinput.jsonl", "rb") as batch_jsonl:
    batch_input_file = client.files.create(
        file=batch_jsonl,
        purpose="batch"
    )

print(batch_input_file)

FileObject(id='file-Q5edwCUutZ9FZdmr8V1NJx', bytes=687797, created_at=1745701318, filename='batchinput.jsonl', object='file', purpose='batch', status='processed', status_details=None, expires_at=None)


In [148]:
# Submit the uploaded request file as a batch job against the chat-completions
# endpoint, with a 24h completion window and a free-text description attached.
batch_input_file_id = batch_input_file.id
batch_request = dict(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "question scores, 4.1-mini"},
)
batch = client.batches.create(**batch_request)
print(batch)

Batch(id='batch_680d49c8f3108190aa9c0279867d4d86', completion_window='24h', created_at=1745701320, endpoint='/v1/chat/completions', input_file_id='file-Q5edwCUutZ9FZdmr8V1NJx', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1745787720, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'question scores, 4.1-mini'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [160]:
# Re-fetch the batch by id to see its current status; re-run this cell until
# the status printed below shows the job has finished.
submitted_batch_id = batch.id
batch_retrieved = client.batches.retrieve(submitted_batch_id)
print(batch_retrieved.status)

completed


In [161]:
# Download the batch's output file and print its raw text
# (one JSON object per line, one line per request).
results_file_id = batch_retrieved.output_file_id
file_response = client.files.content(results_file_id)
print(file_response.text)

{"id": "batch_req_680d4fc81294819097bf60f73bdcfc5d", "custom_id": "request-q48", "response": {"status_code": 200, "request_id": "6bcea2365e24fd856c3384b7f00621c6", "body": {"id": "chatcmpl-BQgyByqgblFdugObpTAQ5kKLplceu", "object": "chat.completion", "created": 1745701403, "model": "gpt-4.1-mini-2025-04-14", "choices": [{"index": 0, "message": {"role": "assistant", "content": "2", "refusal": null, "annotations": []}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 195, "completion_tokens": 2, "total_tokens": 197, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_prediction_tokens": 0}}, "service_tier": "default", "system_fingerprint": "fp_79b79be41f"}}, "error": null}
{"id": "batch_req_680d4fc8261481908ee5819e170d2aa3", "custom_id": "request-q46", "response": {"status_code": 200, "request_id": "399edd1438e678aecbe7749cd1be4dda", "body

In [162]:
# Decode each line of the downloaded output into a dict (one result per request).
lines = []
for raw_line in file_response.iter_lines():
    lines.append(json.loads(raw_line))

In [163]:
import re 
def _reply_text(result):
    """Return the assistant message text from one batch result, or '' if the request produced none."""
    response = result.get('response') or {}
    body = response.get('body') or {}
    choices = body.get('choices') or []
    if not choices:
        return ''
    return choices[0].get('message', {}).get('content') or ''


def _first_digit(reply):
    """Return the first digit found in `reply` as an int, or None when it contains no digit."""
    found = re.search(r'\d', reply)
    return int(found.group()) if found else None


# Batch output lines are not guaranteed to come back in input order, and
# dropping replies with no digit would shift every later score onto the wrong
# question. Key each score by its custom_id instead, then lay the scores out in
# filtered_qs row order (custom_id was written as f"request-{index label}" when
# the request file was built). Questions without a usable reply get None.
score_by_request = {
    result['custom_id']: _first_digit(_reply_text(result)) for result in lines
}
scores = [score_by_request.get(f"request-{qid}") for qid in filtered_qs.index]

# Surface gaps explicitly rather than hiding them.
unscored = [qid for qid, score in zip(filtered_qs.index, scores) if score is None]
if unscored:
    print(f"warning: no score parsed for {len(unscored)} question(s), e.g. {unscored[:5]}")

scores

[2,
 2,
 3,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 1,
 3,
 3,
 1,
 4,
 3,
 3,
 3,
 2,
 2,
 4,
 2,
 2,
 4,
 4,
 4,
 5,
 2,
 3,
 4,
 2,
 4,
 1,
 2,
 3,
 2,
 2,
 1,
 2,
 2,
 3,
 4,
 4,
 5,
 2,
 4,
 3,
 2,
 4,
 3,
 2,
 2,
 2,
 3,
 3,
 3,
 1,
 2,
 1,
 2,
 3,
 3,
 3,
 2,
 4,
 2,
 4,
 2,
 1,
 3,
 2,
 3,
 2,
 3,
 4,
 2,
 2,
 2,
 4,
 2,
 2,
 2,
 3,
 4,
 3,
 2,
 2,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 4,
 1,
 2,
 1,
 2,
 2,
 2,
 3,
 2,
 2,
 2,
 3,
 1,
 2,
 3,
 3,
 2,
 3,
 2,
 3,
 2,
 2,
 3,
 2,
 2,
 1,
 4,
 2,
 2,
 3,
 2,
 2,
 3,
 2,
 2,
 1,
 2,
 2,
 2,
 3,
 3,
 4,
 4,
 3,
 2,
 4,
 4,
 4,
 5,
 5,
 3,
 3,
 3,
 2,
 3,
 4,
 3,
 3,
 2,
 2,
 2,
 4,
 2,
 2,
 3,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 3,
 4,
 2,
 1,
 2,
 1,
 3,
 2,
 2,
 2,
 2,
 4,
 3,
 3,
 2,
 2,
 2,
 3,
 2,
 2,
 4,
 1,
 2,
 4,
 3,
 1,
 2,
 2,
 2,
 3,
 2,
 3,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 4,
 3,
 3,
 3,
 1,
 2,
 2,
 2,
 3,
 1,
 3,
 3,
 4,
 2,
 3,
 3,
 3,
 3,
 2,
 2,
 4,
 2,
 1,
 3,
 3,
 2,
 2,
 3,
 3,
 5,
 4,
 4,
 2,
 2,
 2,


In [164]:
# Attach the scores as a new column. This is a positional assignment: `scores`
# must have exactly one entry per row of filtered_qs, in row order.
filtered_qs = filtered_qs.assign(scores=scores)

In [165]:
# Add a 0-based running position for each row (0, 1, 2, ... in current row order).
# The length comes from `scores`, so it must match the number of rows.
filtered_qs = filtered_qs.assign(idx=range(len(scores)))

In [167]:
# Spot-check ten randomly chosen questions alongside their assigned scores.
spot_check_cols = ['text', 'option_1', 'option_2', 'scores']
filtered_qs.sample(10)[spot_check_cols]

Unnamed: 0,text,option_1,option_2,scores
q19737,Could you live a life full of traveling?,Yes,No,2
q43304,Do you get angry when you lose a game?,Usually,Sometimes,2
q67821,Would it bother you if your partner kept pictu...,Yes.,No.,2
q358084,Do you enjoy intense intellectual conversations?,Yes,No,3
q86366,Could you have respect for someone after havin...,Yes.,No.,3
q31898,How do you feel about zoos?,I think they're lots of fun!,"Sure, they're fine.",2
q234,Does fashion matter to you?,A lot,A little,2
q70302,"Of the following, which do you spend the most ...",Washing and/or conditioning.,Shaving.,1
q55744,"If you were going to have a child, would you w...",Yes.,No.,5
q212814,How important are your political beliefs to you?,Very important,Somewhat important,4


In [168]:
# Persist the question table, now including the `scores` and `idx` columns.
question_weights_path = '../outputs/question_weights.csv'
filtered_qs.to_csv(question_weights_path)