In [3]:
import pandas as pd
import openai
import os
import json
import requests


# Load construct definitions

In [4]:
# Load the value definitions
with open('other_vals copy 4.json') as f:
    val_list = json.load(f)
val_list
print(f'length of val_list: {len(val_list)}')
with open('schwartz_values_10.json') as f:
    schwartz_list = json.load(f)
schwartz_list


with open("primals_beliefs.json") as f:
    belief_list = json.load(f)

length of val_list: 32


# Load participant data

In [5]:
# load the participant data
df = pd.read_csv('../data/proc/vbbr_bot_jan2025_proc.csv')
pids = df['pid'].unique()
print(len(pids))

283


In [6]:
# load the reappraisals
df_reaps = pd.read_csv('../data/raw/sql_export/reappraisals.csv')
df_reaps = df_reaps[df_reaps['participant_id'].isin(pids)]
# count unique participants
print(len(df_reaps['participant_id'].unique()))
print(f"columns: {df_reaps.columns}")


272
columns: Index(['id', 'issue_id', 'participant_id', 'domain', 'reap_num', 'text',
       'success', 'believable', 'valued', 'relevance', 'created_at',
       'updated_at', 'deleted_at'],
      dtype='object')


In [7]:
df_issues = pd.read_csv('../data/raw/sql_export/issues.csv')
print(f"columns: {df_issues.columns}")

columns: Index(['id', 'participant_id', 'domain', 'neg', 'pos', 'summary', 'created_at',
       'updated_at', 'deleted_at'],
      dtype='object')


In [8]:
# join on domain and id/issue_id
df_issues_reaps = pd.merge(
    df_issues.loc[:, ['domain', 'participant_id', 'summary']],
    df_reaps.loc[:, ['participant_id', 'id', 'domain', 'text']],
    left_on=['domain', 'participant_id'],
    right_on=['domain', 'participant_id'])

print(f"columns: {df_issues_reaps.columns}")    
print(f"shape: {df_issues_reaps.shape}")
display(df_issues_reaps.head())

columns: Index(['domain', 'participant_id', 'summary', 'id', 'text'], dtype='object')
shape: (2692, 5)


Unnamed: 0,domain,participant_id,summary,id,text
0,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,46,It's understandable to feel anxious about bein...
1,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,47,"Social interactions can be daunting, especiall..."
2,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,48,"Feeling awkward can lead to anxiety, but it ca..."
3,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,49,People treating you differently based on perce...
4,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,50,Networking and maintaining friendships might b...


In [122]:
2962*37

109594

In [9]:
df_messages = pd.read_csv('../data/raw/sql_export/messages.csv')


# Other vals batch processing

## Create submission

In [8]:
# prompt_other_vals = """
# <ignore>{salt}</ignore>

# Your task is to determine whether a given cognitive reappraisal for an issue reflects the value of {value}. 
# A value is only incorporated if the active ingredient of the reappraisal, i.e., the crux of what helps someone feel better, revolves around the value of {value}.

# Very briefly (1-3 sentences) walk through your thought process about whether the reappraisal centers on the value of {value} and then ultimately decide whether it does or not by responding with one of the following codes:

# - `Code[1]` for yes
# - `Code[0]` for no

# Be sure to format your response as `Code[1]` or `Code[0]` at the end of your response formatted just like that after walking through your thought process.

# Issue: {issue}
# Reappraisal: {reappraisal}

# Does this cognitive reappraisal center on the value of {value}? 

# {description}

# Be conservative with your answers. Only respond with `Code[1]` if the active ingredient of the given cognitive reappraisal centers around the value of {value} as described above.
# """

In [9]:
prompt_vals_o3 = """
<ignore>{salt}</ignore>

Your task is to determine whether a given cognitive reappraisal for an issue reflects the value of {value}.

{description}

What does someone who values {value} care about? The value of {value} is incorporated in the reappraisal if the active ingredient of the reappraisal, i.e., the crux of what helps someone feel better, is congruent with the desires of someone who values {value}.

Indicate whether the value is or is not incorporated in the reappraisal by responding with one of the following codes:

- `Code[1]` for yes
- `Code[0]` for no

Be sure to format your response as `Code[1]` or `Code[0]`.

<issue> {issue} </issue>
<reappraisal> {reappraisal} </reappraisal>

Does this cognitive reappraisal address the concerns of someone who values {value}? 

Be conservative with your answers. Only respond with `Code[1]` if the active ingredient of the given cognitive reappraisal explicitly centers around the value of {value} as described above. Do not say anything except `Code[1]` or `Code[0]`.
"""

In [None]:
import os
import json
print(f'length of val_list: {len(val_list)}')
print(f'shape of df_issues_reaps: {df_issues_reaps.shape}')
print(f'estimated entries = {len(val_list)*df_issues_reaps.shape[0]}')

with open("batch_other_vals_o3.jsonl", "w", encoding="utf-8") as f_out:
    for _, row in df_issues_reaps.iterrows():
        for val in val_list:
            value_name = val['name']
            description = val['description']
            
            # Extract needed fields
            pid = row['participant_id']
            domain = row['domain']
            reap_id = row['id']             # The "id" column from your reappraisal df
            issue = row['summary']
            reappraisal = row['text']

            # Generate a random salt
            salt = os.urandom(2).hex()

            # Format the system prompt
            system_prompt = prompt_vals_o3.format(
                salt=salt,
                value=value_name,
                issue=issue,
                reappraisal=reappraisal,
                description=description
            )

            # Construct the JSON for each line
            value_name_clean = value_name.replace(" ", "_").lower()
            value_name_clean = value_name_clean.replace("-", "_")
            data = {
                "custom_id": f"{pid}-{domain}-{reap_id}-{value_name_clean}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "o3-mini",
                    "messages": [
                        {
                            "role": "system",
                            "content": system_prompt
                        }
                    ],
                    "reasoning_effort": "low",
                }
            }

            # Write each request as one line of JSON
            f_out.write(json.dumps(data, ensure_ascii=False) + "\n")

length of val_list: 32
shape of df_issues_reaps: (2692, 5)
estimated entries = 86144


## Count tokens

In [None]:
import tiktoken


def num_tokens_from_string(string: str, model: str="gpt-4o") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(model)
    num_tokens = len(encoding.encode(string))
    return num_tokens


tokens = 0
for _, row in df_issues_reaps.iterrows():
    for val in val_list:
        value_name = val['name']
        description = val['description']
        
        # Extract needed fields
        pid = row['participant_id']
        domain = row['domain']
        reap_id = row['id']             # The "id" column from your reappraisal df
        issue = row['summary']
        reappraisal = row['text']

        # Generate a random salt
        salt = os.urandom(2).hex()

        # Format the system prompt
        system_prompt = prompt_vals_o3.format(
            salt=salt,
            value=value_name,
            issue=issue,
            reappraisal=reappraisal,
            description=description
        )
        tokens += num_tokens_from_string(system_prompt)
        
print(f"Total tokens: {tokens}")

        


In [19]:
(tokens * .55)/1e6 + (86144 * 2.2 * 250)/1e6

75.30924005000001

## Submit batch

In [22]:
import json
from openai import OpenAI

# Function to read and split the JSONL file into chunks of 50k lines
def read_and_split_jsonl(file_path, max_lines=50000):
    with open(file_path, "r") as file:
        lines = file.readlines()
    
    # Split the lines into chunks of max_lines
    chunks = [lines[i:i + max_lines] for i in range(0, len(lines), max_lines)]
    return chunks


client = OpenAI()

# Read and split the original file
file_path = "batch_other_vals_o3.jsonl"
chunks = read_and_split_jsonl(file_path)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = file_path  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} - vbbr_bot_streamlit - other_vals_o3"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-TVQzLMbWnpKv7wc2we97uD', bytes=106853228, created_at=1738733265, filename='batch_other_vals_o3.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_67a2f6d456f48190b250694d308a9d12', completion_window='24h', created_at=1738733268, endpoint='/v1/chat/completions', input_file_id='file-TVQzLMbWnpKv7wc2we97uD', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1738819668, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 - vbbr_bot_streamlit - other_vals_o3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
FileObject(id='file-5ttqjL32yVMLKeRFvfVE5o', bytes=78580680, created_at=1738733324, filename='batch_other_vals_o3.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_67a2f70f8fec81908e650e8ce8dfb

## Retrieve batch

In [26]:

from openai import OpenAI
client = OpenAI()
# client.batches.cancel("batch_abc123")
openai_files = client.batches.list(limit=6).data
openai_files



[Batch(id='batch_67ab9622736c819085007114cdc811cb', completion_window='24h', created_at=1739298338, endpoint='/v1/chat/completions', input_file_id='file-WKUysKYLdbtqex3YdtyyB7', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739309927, error_file_id=None, errors=None, expired_at=None, expires_at=1739384738, failed_at=None, finalizing_at=1739309679, in_progress_at=1739298341, metadata={'description': 'Chunk 1 - vbbr_bot_streamlit failed lines - other_vals_o3'}, output_file_id='file-P8nrnzZEAbaEnPTkFzRxGZ', request_counts=BatchRequestCounts(completed=2709, failed=0, total=2709)),
 Batch(id='batch_67ab938d9e888190b6c6340480e408d2', completion_window='24h', created_at=1739297677, endpoint='/v1/chat/completions', input_file_id='file-RwDGW1TXuzBVJqH6n5CnWw', object='batch', status='cancelled', cancelled_at=1739298447, cancelling_at=1739297787, completed_at=None, error_file_id='file-UWXbCCPtLydspKXAD6vvtD', errors=None, expired_at=None, expires_at=173

In [27]:
# feb 5
other_vals_1a_file_id = "file-2LsupYC8AwH7mJiMrF3E8E"
other_vals_1b_file_id = "file-QadV6TJE3ZCB6AT9rKsD7N"
data = [
    (client.files.content(other_vals_1a_file_id).text, "1a"),
    (client.files.content(other_vals_1b_file_id).text, "1b"),
]

In [155]:

# other_vals_1a_file_id = "file-W3vNc6KvXoGgHFyvpa9b5Y"
# other_vals_1b_file_id = "file-MVmQWB9omXsmJkcLgwZWNn"
# other_vals_2a_file_id = "file-YCnGSaBAQcdvcQC8whnSF4"
# other_vals_2b_file_id = "file-6NMKm4YCP5qeFVuiUwfaHa"
# other_vals_3a_file_id = "file-5gTkpTa48ZKFAtid7aDqGx"
# other_vals_3b_file_id = "file-AVwgg4wtnBicBDSKw4wuhW"

# data = [
#     (client.files.content(other_vals_1a_file_id).text, "1a"),
#     (client.files.content(other_vals_1b_file_id).text, "1b"),
#     (client.files.content(other_vals_2a_file_id).text, "2a"),
#     (client.files.content(other_vals_2b_file_id).text, "2b"),
#     (client.files.content(other_vals_3a_file_id).text, "3a"),
#     (client.files.content(other_vals_3b_file_id).text, "3b"),
# ]

In [28]:

def convert_batch_output_to_pd(raw_data, batch_num):

    # Split the data into separate JSON strings
    json_objects = raw_data.strip().split('\n')

    # Parse the JSON objects
    parsed_data = [json.loads(obj) for obj in json_objects]

    # Extract relevant fields
    extracted_data = []
    for item in parsed_data:
        response_body = item['response']['body']
        usage = response_body['usage']
        choice = response_body['choices'][0]
        
        extracted_data.append({
            'batch_id': item['id'],
            'batch_num': batch_num,
            'custom_id': item['custom_id'],
            'status_code': item['response']['status_code'],
            'request_id': item['response']['request_id'],
            'completion_id': response_body['id'],
            'model': response_body['model'],
            'created': response_body['created'],
            'assistant_message': choice['message']['content'],
            'finish_reason': choice['finish_reason'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
            'system_fingerprint': response_body['system_fingerprint']
        })

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data)

    df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
    df['reap_id'] = df['reap_id'].astype(int)
    df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
    df['code'] = df['code'].str.extract(r'(\d)')
    df['code'] = df['code'].astype(float)
    
    
    return df

df_other_vals_coding = pd.concat([convert_batch_output_to_pd(data[i][0], data[i][1]) for i in range(len(data))])


In [29]:
df_other_vals_coding.to_csv("df_other_vals_coding.tst.csv", index=False)
# df_other_vals_coding.to_csv("df_other_vals_coding.csv", index=False)

## redoing failed lines

In [10]:
df_other_vals_coding = pd.read_csv("df_other_vals_coding.csv")

In [13]:
failed_ids = df_other_vals_coding.loc[df_other_vals_coding['finish_reason'] != "stop", 'custom_id']
df_other_vals_coding.loc[df_other_vals_coding['finish_reason'] != "stop"].shape

(2709, 19)

### make failed lines file

In [18]:

file_path = "batch_other_vals_o3.jsonl"
new_file_path = "batch_other_vals_o3_failed.jsonl"
# filter lines where the custom_id is in the failed_ids and write to new json
with open(file_path, "r") as f_in, open(new_file_path, "w") as f_out:
    for line in f_in:
        data = json.loads(line)
        if data['custom_id'] in failed_ids.values:
            # remove data['body']['max_completion_tokens']
            del data['body']['max_completion_tokens']
            f_out.write(json.dumps(data, ensure_ascii=False) + "\n")

### submit failed lines

In [None]:

# Read and split the original file
file_path = new_file_path
chunks = read_and_split_jsonl(file_path)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = file_path  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} - vbbr_bot_streamlit failed lines - other_vals_o3"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-WKUysKYLdbtqex3YdtyyB7', bytes=5738139, created_at=1739298337, filename='batch_other_vals_o3_failed.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_67ab9622736c819085007114cdc811cb', completion_window='24h', created_at=1739298338, endpoint='/v1/chat/completions', input_file_id='file-WKUysKYLdbtqex3YdtyyB7', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1739384738, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 - vbbr_bot_streamlit failed lines - other_vals_o3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


### retrieve redone lines

In [22]:

from openai import OpenAI
client = OpenAI()
# client.batches.cancel("batch_abc123")
openai_files = client.batches.list(limit=6).data
openai_files


[Batch(id='batch_67ab9622736c819085007114cdc811cb', completion_window='24h', created_at=1739298338, endpoint='/v1/chat/completions', input_file_id='file-WKUysKYLdbtqex3YdtyyB7', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739309927, error_file_id=None, errors=None, expired_at=None, expires_at=1739384738, failed_at=None, finalizing_at=1739309679, in_progress_at=1739298341, metadata={'description': 'Chunk 1 - vbbr_bot_streamlit failed lines - other_vals_o3'}, output_file_id='file-P8nrnzZEAbaEnPTkFzRxGZ', request_counts=BatchRequestCounts(completed=2709, failed=0, total=2709)),
 Batch(id='batch_67ab938d9e888190b6c6340480e408d2', completion_window='24h', created_at=1739297677, endpoint='/v1/chat/completions', input_file_id='file-RwDGW1TXuzBVJqH6n5CnWw', object='batch', status='cancelled', cancelled_at=1739298447, cancelling_at=1739297787, completed_at=None, error_file_id='file-UWXbCCPtLydspKXAD6vvtD', errors=None, expired_at=None, expires_at=173

In [23]:
file_id = "file-P8nrnzZEAbaEnPTkFzRxGZ"
data = [
    (client.files.content(file_id).text, "2")
]

In [24]:

def convert_batch_output_to_pd(raw_data, batch_num):

    # Split the data into separate JSON strings
    json_objects = raw_data.strip().split('\n')

    # Parse the JSON objects
    parsed_data = [json.loads(obj) for obj in json_objects]

    # Extract relevant fields
    extracted_data = []
    for item in parsed_data:
        response_body = item['response']['body']
        usage = response_body['usage']
        choice = response_body['choices'][0]
        
        extracted_data.append({
            'batch_id': item['id'],
            'batch_num': batch_num,
            'custom_id': item['custom_id'],
            'status_code': item['response']['status_code'],
            'request_id': item['response']['request_id'],
            'completion_id': response_body['id'],
            'model': response_body['model'],
            'created': response_body['created'],
            'assistant_message': choice['message']['content'],
            'finish_reason': choice['finish_reason'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
            'system_fingerprint': response_body['system_fingerprint']
        })

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data)

    df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
    df['reap_id'] = df['reap_id'].astype(int)
    df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
    df['code'] = df['code'].str.extract(r'(\d)')
    df['code'] = df['code'].astype(float)
    
    
    return df

df_other_vals_coding_failed = pd.concat([convert_batch_output_to_pd(data[i][0], data[i][1]) for i in range(len(data))])


In [25]:
df_other_vals_coding_failed.to_csv("df_other_vals_coding_failed.csv", index=False)

## redo failed 2

In [None]:
import pandas as pd
df_redo = pd.read_csv("~/Downloads/tmpasldkfj.csv")
val_map = {
    "personalGrowth": "growth",
    "interpersonalConnection": "connection",
    "humanDiversity": "diversity",
    "interpersonalHarmony": "interpersonal_harmony",
    "selfControl": "self_control",
}
val_map_rev = {v: k for k, v in val_map.items()}
# df_redo['value_name'].unique()
with open("batch_other_vals_o3.jsonl", "r") as f:
    with open("batch_other_vals_o3_failed_2.jsonl", "w") as f_out:
        lines = f.readlines()
        for line in lines:
            pid, domain, reap_id, val = json.loads(line)['custom_id'].split("-") 
            val = val_map_rev[val] if val in val_map_rev else val
            if not df_redo.loc[(df_redo['reap_id'] == int(reap_id)) & (df_redo['value_name'] == val)].empty:
                f_out.write(line)
        

### submit failed lines

In [52]:

# Read and split the original file
file_path = "batch_other_vals_o3_failed_2.jsonl"
chunks = read_and_split_jsonl(file_path)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = file_path  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} - vbbr_bot_streamlit failed lines 2 - other_vals_o3"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-N34eP5VtECgkCTqwt8KNfH', bytes=287316, created_at=1739667147, filename='batch_other_vals_o3_failed_2.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_67b136cd343c8190b5dc626df2d99694', completion_window='24h', created_at=1739667149, endpoint='/v1/chat/completions', input_file_id='file-N34eP5VtECgkCTqwt8KNfH', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1739753549, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 - vbbr_bot_streamlit failed lines 2 - other_vals_o3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


### retrieve redone lines

In [15]:

from openai import OpenAI
client = OpenAI()
# client.batches.cancel("batch_abc123")
openai_files = client.batches.list(limit=6).data
openai_files


[Batch(id='batch_67b381158a388190be45c51350f41244', completion_window='24h', created_at=1739817237, endpoint='/v1/chat/completions', input_file_id='file-C4i1tXJgjxDHFQtpJ3nh4o', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739817515, error_file_id=None, errors=None, expired_at=None, expires_at=1739903637, failed_at=None, finalizing_at=1739817508, in_progress_at=1739817238, metadata={'description': 'Chunk 1 - vbbr_bot_streamlit failed lines 3 - other_vals_o3'}, output_file_id='file-4u7DbVjuPztQNwiNruxgWa', request_counts=BatchRequestCounts(completed=44, failed=0, total=44)),
 Batch(id='batch_67b136cd343c8190b5dc626df2d99694', completion_window='24h', created_at=1739667149, endpoint='/v1/chat/completions', input_file_id='file-N34eP5VtECgkCTqwt8KNfH', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1739668209, error_file_id='file-53SmLVNCLhPNNPoAp6etTZ', errors=None, expired_at=None, expires_at=1739753549,

In [None]:
from openai import OpenAI

client = OpenAI()
file_id = "file-TyaZrW9fzsy55rvF8CNfTw"
data = [
    (client.files.content(file_id).text, "3")
]

# write to json
with open("batch_other_vals_o3_failed_2_output.jsonl", "w") as f_out:
    for i, (text, num) in enumerate(data):
        f_out.write(text)


In [6]:
import json
import pandas as pd

def convert_batch_output_to_pd(raw_data, batch_num):

    # Split the data into separate JSON strings
    json_objects = raw_data.strip().split('\n')

    # Parse the JSON objects
    parsed_data = [json.loads(obj) for obj in json_objects]

    # Extract relevant fields
    extracted_data = []
    for item in parsed_data:
        response_body = item['response']['body']
        usage = response_body['usage']
        choice = response_body['choices'][0]
        
        extracted_data.append({
            'batch_id': item['id'],
            'batch_num': batch_num,
            'custom_id': item['custom_id'],
            'status_code': item['response']['status_code'],
            'request_id': item['response']['request_id'],
            'completion_id': response_body['id'],
            'model': response_body['model'],
            'created': response_body['created'],
            'assistant_message': choice['message']['content'],
            'finish_reason': choice['finish_reason'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
            'system_fingerprint': response_body['system_fingerprint']
        })

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data)

    df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
    df['reap_id'] = df['reap_id'].astype(int)
    df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
    df['code'] = df['code'].str.extract(r'(\d)')
    df['code'] = df['code'].astype(float)
    
    
    return df

df_other_vals_coding_failed = pd.concat([convert_batch_output_to_pd(data[i][0], data[i][1]) for i in range(len(data))])


In [7]:
df_other_vals_coding_failed.to_csv("df_other_vals_coding_failed_2.csv", index=False)

# Redo failed lines 3

In [11]:
df_tmp = pd.read_csv("~/Downloads/tmpsladkfjas.csv")
failed_ids = df_tmp['custom_id'].values
failed_ids

array(['66614c693d129e9d6e6bfa10-relationship-230-growth',
       '66a398080da27efcc3e8ecef-career-497-success',
       '5d9b5e2d8465bf02e1190626-relationship-562-interpersonal_harmony',
       '60b76ee2219ac1ce25ccea43-relationship-427-benevolence',
       '665cbb56d9ecb20564b9b607-career-265-stability',
       '665cbb56d9ecb20564b9b607-career-261-health',
       '66614c693d129e9d6e6bfa10-career-308-drive',
       '669dd09b9f38ff047dce18b0-relationship-340-interpersonal_harmony',
       '66ac140ffbe9e717e0d21357-relationship-342-benevolence',
       '670170d009f9a820a5d1e654-relationship-774-health',
       '64d50a995a3627ba7ebeccb7-relationship-489-collaboration',
       '662a05cb8d07939ee23e41a4-relationship-437-connection',
       '65172a5accd4f7b31d650d6f-relationship-1099-community',
       '671b563f47fb11fe5e685f43-career-1590-connection',
       '5dd352c51c219b35931aefd1-career-1023-collaboration',
       '67374b226c6050f8e53e8c27-relationship-1054-perseverance',
       '65b98f

In [12]:

file_path = "batch_other_vals_o3.jsonl"
new_file_path = "batch_other_vals_o3_failed_3.jsonl"
# filter lines where the custom_id is in the failed_ids and write to new json
with open(file_path, "r") as f_in, open(new_file_path, "w") as f_out:
    for line in f_in:
        data = json.loads(line)
        if data['custom_id'] in failed_ids:
            # remove data['body']['max_completion_tokens']
            del data['body']['max_completion_tokens']
            f_out.write(json.dumps(data, ensure_ascii=False) + "\n")

In [14]:

# Function to read and split the JSONL file into chunks of 50k lines
def read_and_split_jsonl(file_path, max_lines=50000):
    with open(file_path, "r") as file:
        lines = file.readlines()
    
    # Split the lines into chunks of max_lines
    chunks = [lines[i:i + max_lines] for i in range(0, len(lines), max_lines)]
    return chunks

# Read and split the original file
file_path = new_file_path
chunks = read_and_split_jsonl(file_path)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = file_path  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} - vbbr_bot_streamlit failed lines 3 - other_vals_o3"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-C4i1tXJgjxDHFQtpJ3nh4o', bytes=92791, created_at=1739817236, filename='batch_other_vals_o3_failed_3.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_67b381158a388190be45c51350f41244', completion_window='24h', created_at=1739817237, endpoint='/v1/chat/completions', input_file_id='file-C4i1tXJgjxDHFQtpJ3nh4o', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1739903637, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 - vbbr_bot_streamlit failed lines 3 - other_vals_o3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [17]:
from openai import OpenAI

client = OpenAI()
file_id = "file-4u7DbVjuPztQNwiNruxgWa"
data = [
    (client.files.content(file_id).text, "4")
]

# write to json
with open("batch_other_vals_o3_failed_3_output.jsonl", "w") as f_out:
    for i, (text, num) in enumerate(data):
        f_out.write(text)


In [18]:
import json
import pandas as pd

def convert_batch_output_to_pd(raw_data, batch_num):

    # Split the data into separate JSON strings
    json_objects = raw_data.strip().split('\n')

    # Parse the JSON objects
    parsed_data = [json.loads(obj) for obj in json_objects]

    # Extract relevant fields
    extracted_data = []
    for item in parsed_data:
        response_body = item['response']['body']
        usage = response_body['usage']
        choice = response_body['choices'][0]
        
        extracted_data.append({
            'batch_id': item['id'],
            'batch_num': batch_num,
            'custom_id': item['custom_id'],
            'status_code': item['response']['status_code'],
            'request_id': item['response']['request_id'],
            'completion_id': response_body['id'],
            'model': response_body['model'],
            'created': response_body['created'],
            'assistant_message': choice['message']['content'],
            'finish_reason': choice['finish_reason'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
            'system_fingerprint': response_body['system_fingerprint']
        })

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data)

    df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
    df['reap_id'] = df['reap_id'].astype(int)
    df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
    df['code'] = df['code'].str.extract(r'(\d)')
    df['code'] = df['code'].astype(float)
    
    
    return df

df_other_vals_coding_failed = pd.concat([convert_batch_output_to_pd(data[i][0], data[i][1]) for i in range(len(data))])


In [19]:
df_other_vals_coding_failed.to_csv("df_other_vals_coding_failed_3.csv", index=False)

# Schwartz batch processing

## Create submission

In [45]:
prompt_schwartz = """
<ignore>{salt}</ignore>

Your task is to determine whether a given cognitive reappraisal for an issue reflects the value of {value}. 
A value is only incorporated if the active ingredient of the reappraisal, i.e., the crux of what helps someone feel better, revolves around the value of {value}.

Very briefly (1-3 sentences) walk through your thought process about whether the reappraisal centers on the value of {value} and then ultimately decide whether it does or not by responding with one of the following codes:

- `Code[1]` for yes
- `Code[0]` for no

Be sure to format your response as `Code[1]` or `Code[0]` at the end of your response formatted just like that after walking through your thought process.

Issue: {issue}
Reappraisal: {reappraisal}

Does this cognitive reappraisal center on the value of {value}? 

{description}

Be conservative with your answers. Only respond with `Code[1]` if the active ingredient of the given cognitive reappraisal centers around the value of {value} as described above.
"""

In [48]:
import os
import json


with open("batch_schwartz.jsonl", "w", encoding="utf-8") as f_out:
    for _, row in df_issues_reaps.iterrows():
        for val in schwartz_list:
            value_name = val['name']
            description = val['description']
            
            # Extract needed fields
            pid = row['participant_id']
            domain = row['domain']
            reap_id = row['id']             # The "id" column from your reappraisal df
            issue = row['summary']
            reappraisal = row['text']

            # Generate a random salt
            salt = os.urandom(8).hex()

            # Format the system prompt
            system_prompt = prompt_schwartz.format(
                salt=salt,
                value=value_name,
                issue=issue,
                reappraisal=reappraisal,
                description=description
            )

            # Construct the JSON for each line
            value_name_clean = value_name.replace(" ", "_").lower()
            value_name_clean = value_name_clean.replace("-", "_")
            data = {
                "custom_id": f"{pid}-{domain}-{reap_id}-{value_name_clean}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": MODEL,
                    "messages": [
                        {
                            "role": "system",
                            "content": system_prompt
                        }
                    ],
                    "temperature": TEMP,
                    "max_tokens": 120,
                }
            }

            # Write each request as one line of JSON
            f_out.write(json.dumps(data, ensure_ascii=False) + "\n")

## Submit batch

In [49]:
import json
from openai import OpenAI

# Function to read and split the JSONL file into chunks of 50k lines
def read_and_split_jsonl(file_path, max_lines=50000):
    with open(file_path, "r") as file:
        lines = file.readlines()
    
    # Split the lines into chunks of max_lines
    chunks = [lines[i:i + max_lines] for i in range(0, len(lines), max_lines)]
    return chunks


client = OpenAI()

# Read and split the original file
file_path = "batch_schwartz.jsonl"
chunks = read_and_split_jsonl(file_path)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = file_path  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} of batch processing - schwartz"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-12yNGJyxJRx9B1SFAsqDYW', bytes=58402876, created_at=1736980215, filename='batch_schwartz.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_678836f96e648190a7177930cc16dc26', completion_window='24h', created_at=1736980217, endpoint='/v1/chat/completions', input_file_id='file-12yNGJyxJRx9B1SFAsqDYW', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1737066617, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 of batch processing - schwartz'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


# Belief batch processing

## Create belief submission

In [31]:
# prompt_primals = """
# <ignore>{salt}</ignore>

# Your task is to determine whether a given cognitive reappraisal for an issue reflects the belief that the world is {belief}. 

# Very briefly (1-3 sentences) walk through your thought process about whether the reappraisal reflects the belief that the world is {belief} and then ultimately decide whether it does or not by responding with one of the following codes:

# - `Code[1]` for yes
# - `Code[0]` for no

# Be sure to format your response as `Code[1]` or `Code[0]` at the end of your response formatted just like that after walking through your thought process.

# Issue: {issue}
# Reappraisal: {reappraisal}

# Does this cognitive reappraisal reflect the belief that the world is {belief}? 

# Believing that the world is {belief} is defined as follows:
# {description}

# Be conservative with your answers. Only respond with `Code[1]` if the active ingredient of the given cognitive reappraisal very clearly and explicitly reflects the belief that the world is {belief} as defined above.
# """


prompt_primals_o3 = """
<ignore>{salt}</ignore>

Your task is to determine whether a given cognitive reappraisal for an issue incorporates the belief that the world is {belief}. 

<{belief}-belief-definition> {description} </{belief}-belief-definition>

A reappraisal reflects the belief that the world is {belief} if its main emotional relief or “active ingredient” specifically rests on viewing this single situation as an example that the world is, in essence, {belief}. It’s enough to show that the reappraisal treats this particular situation as evidence or an instantiation of the broader outlook that “the world is {belief},“ even if it does not claim the entire world is always that way.


Indicate whether the belief is or is not incorporated in the reappraisal by responding with one of the following codes:

- `Code[1]` for yes
- `Code[0]` for no

Be sure to format your response as `Code[1]` or `Code[0]`.

<issue> {issue} </issue>
<reappraisal> {reappraisal} </reappraisal>

Be conservative with your answers. Only respond with `Code[1]` if the active ingredient of the given cognitive reappraisal clearly and explicitly is evidence for the view that the world is {belief} as described above. Otherwise, respond with `Code[0]`.
"""

In [42]:

import os
import json
from datetime import datetime, timezone

batch_primals_submission_fpath = f"batch_primals_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.jsonl"
with open(batch_primals_submission_fpath, "w", encoding="utf-8") as f_out:
    for _, row in df_issues_reaps.iterrows():
        for b in belief_list:
            belief_name = b['name']
            description = b['description']
            
            # Extract needed fields
            pid = row['participant_id']
            domain = row['domain']
            reap_id = row['id']             # The "id" column from your reappraisal df
            issue = row['summary']
            reappraisal = row['text']

            # Generate a random salt
            salt = os.urandom(2).hex()

            # Format the system prompt
            system_prompt = prompt_primals_o3.format(
                salt=salt,
                belief=belief_name,
                issue=issue,
                reappraisal=reappraisal,
                description=description
            )

            # Construct the JSON for each line
            belief_name_clean = belief_name.replace(" ", "_").lower()
            belief_name_clean = belief_name_clean.replace("-", "_")
            data = {
                "custom_id": f"{pid}-{domain}-{reap_id}-{belief_name_clean}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "o3-mini",
                    "messages": [
                        {
                            "role": "system",
                            "content": system_prompt
                        }
                    ],
                    "reasoning_effort": "low",
                }
            }

            # Write each request as one line of JSON
            f_out.write(json.dumps(data, ensure_ascii=False) + "\n")

In [43]:
import json
from openai import OpenAI

# Function to read and split the JSONL file into chunks of 50k lines
def read_and_split_jsonl(file_path, max_lines=50000):
    with open(file_path, "r") as file:
        lines = file.readlines()
    
    # Split the lines into chunks of max_lines
    chunks = [lines[i:i + max_lines] for i in range(0, len(lines), max_lines)]
    return chunks


client = OpenAI()

# Read and split the original file
chunks = read_and_split_jsonl(batch_primals_submission_fpath)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = file_path  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} vbbr_bot_streamlit - primals - o3mini"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-QXUTcXFYe9UJcpWpDyHUwA', bytes=117091986, created_at=1738801980, filename='batch_other_vals_o3.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_67a40340be9c819087b8c1716b5f9259', completion_window='24h', created_at=1738801984, endpoint='/v1/chat/completions', input_file_id='file-QXUTcXFYe9UJcpWpDyHUwA', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1738888384, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 vbbr_bot_streamlit - primals - o3mini'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
FileObject(id='file-VLb71KHTi1JU3pjeq8uVX5', bytes=47268446, created_at=1738801987, filename='batch_other_vals_o3.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_67a40344cc348190adf4baab309b

## Retrieve batch

In [38]:

from openai import OpenAI
client = OpenAI()
# client.batches.cancel("batch_abc123")
openai_files = client.batches.list(limit=6).data
openai_files



[Batch(id='batch_67a3a0370cf48190b208dc434801019f', completion_window='24h', created_at=1738776631, endpoint='/v1/chat/completions', input_file_id='file-7BcJ4sMpkCPUp87qubenr8', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1738789401, error_file_id='file-XYg1GRFsdkRn7wtZtdHHNF', errors=None, expired_at=None, expires_at=1738863031, failed_at=None, finalizing_at=1738786928, in_progress_at=1738776640, metadata={'description': 'Chunk 2 vbbr_bot_streamlit - primals - o3mini'}, output_file_id='file-S8guhPkdw6mVrfSVKt2gBg', request_counts=BatchRequestCounts(completed=19688, failed=304, total=19992)),
 Batch(id='batch_67a3a01200ac8190ad287da58131b299', completion_window='24h', created_at=1738776594, endpoint='/v1/chat/completions', input_file_id='file-F2vnQPLGLHS3KB9BfjVaVg', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1738797664, error_file_id='file-AZCh1S4RaGbPq5x6H18jWX', errors=None, expired_at=None, exp

In [39]:
# feb 5
primals_1a_file_id = "file-MtKP9AyneSKxVrgt4q36b8"
primals_1b_file_id = "file-S8guhPkdw6mVrfSVKt2gBg"
data = [
    (client.files.content(primals_1a_file_id).text, "1a"),
    (client.files.content(primals_1b_file_id).text, "1b"),
]

In [40]:

def convert_batch_output_to_pd(raw_data, batch_num):

    # Split the data into separate JSON strings
    json_objects = raw_data.strip().split('\n')

    # Parse the JSON objects
    parsed_data = [json.loads(obj) for obj in json_objects]

    # Extract relevant fields
    extracted_data = []
    for item in parsed_data:
        response_body = item['response']['body']
        usage = response_body['usage']
        choice = response_body['choices'][0]
        
        extracted_data.append({
            'batch_id': item['id'],
            'batch_num': batch_num,
            'custom_id': item['custom_id'],
            'status_code': item['response']['status_code'],
            'request_id': item['response']['request_id'],
            'completion_id': response_body['id'],
            'model': response_body['model'],
            'created': response_body['created'],
            'assistant_message': choice['message']['content'],
            'finish_reason': choice['finish_reason'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
            'system_fingerprint': response_body['system_fingerprint']
        })

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data)

    df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
    df['reap_id'] = df['reap_id'].astype(int)
    df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
    df['code'] = df['code'].str.extract(r'(\d)')
    df['code'] = df['code'].astype(float)
    
    
    return df

df_primals_coding = pd.concat([convert_batch_output_to_pd(data[i][0], data[i][1]) for i in range(len(data))])


In [41]:
df_primals_coding.to_csv("df_primals_coding.csv", index=False)

In [45]:

from openai import OpenAI
client = OpenAI()
# client.batches.cancel("batch_abc123")
openai_files = client.batches.list(limit=5).data
openai_files

[Batch(id='batch_67a40344cc348190adf4baab309b2cb4', completion_window='24h', created_at=1738801988, endpoint='/v1/chat/completions', input_file_id='file-VLb71KHTi1JU3pjeq8uVX5', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1738809198, error_file_id=None, errors=None, expired_at=None, expires_at=1738888388, failed_at=None, finalizing_at=1738807730, in_progress_at=1738801994, metadata={'description': 'Chunk 2 vbbr_bot_streamlit - primals - o3mini'}, output_file_id='file-XtzeW1rw6yS4tAnJecQwtQ', request_counts=BatchRequestCounts(completed=19992, failed=0, total=19992)),
 Batch(id='batch_67a40340be9c819087b8c1716b5f9259', completion_window='24h', created_at=1738801984, endpoint='/v1/chat/completions', input_file_id='file-QXUTcXFYe9UJcpWpDyHUwA', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1738822443, error_file_id=None, errors=None, expired_at=None, expires_at=1738888384, failed_at=None, finalizing_at=17

In [48]:
# feb 5
from openai import OpenAI
client = OpenAI()
primals_1_file_id = "file-4BsptATkWcHWSdq8f7xK1T"
primals_2_file_id = "file-XtzeW1rw6yS4tAnJecQwtQ"
data = [
    (client.files.content(primals_1_file_id).text, "1a"),
    (client.files.content(primals_2_file_id).text, "1b"),
]

In [49]:

def convert_batch_output_to_pd(raw_data, batch_num):

    # Split the data into separate JSON strings
    json_objects = raw_data.strip().split('\n')

    # Parse the JSON objects
    parsed_data = [json.loads(obj) for obj in json_objects]

    # Extract relevant fields
    extracted_data = []
    for item in parsed_data:
        response_body = item['response']['body']
        usage = response_body['usage']
        choice = response_body['choices'][0]
        
        extracted_data.append({
            'batch_id': item['id'],
            'batch_num': batch_num,
            'custom_id': item['custom_id'],
            'status_code': item['response']['status_code'],
            'request_id': item['response']['request_id'],
            'completion_id': response_body['id'],
            'model': response_body['model'],
            'created': response_body['created'],
            'assistant_message': choice['message']['content'],
            'finish_reason': choice['finish_reason'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
            'system_fingerprint': response_body['system_fingerprint']
        })

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data)

    df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
    df['reap_id'] = df['reap_id'].astype(int)
    df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
    df['code'] = df['code'].str.extract(r'(\d)')
    df['code'] = df['code'].astype(float)
    
    
    return df

df_primals_coding = pd.concat([convert_batch_output_to_pd(data[i][0], data[i][1]) for i in range(len(data))])


In [52]:
df_primals_coding.to_csv("df_primals_coding.csv", index=False)

# Retrieve processed files

## Old retrieve

In [32]:

import json
import pandas as pd


df_issues_merge = pd.read_csv('../data/raw/sql_export/issues.csv')
df_issues_merge = df_issues_merge.rename(columns={'id': 'issue_id', 'summary': 'issue_summary'})
df_issues_merge = df_issues_merge[['issue_id', 'issue_summary']]

df_reaps_merge = pd.read_csv('../data/raw/sql_export/reappraisals.csv')
df_reaps_merge = df_reaps_merge.rename(columns={'id': 'reap_id', 'text': 'reap_text'})
df_reaps_merge = df_reaps_merge[['reap_id', 'issue_id', 'reap_text']]

def convert_batch_output_to_pd(raw_data):

    # Split the data into separate JSON strings
    json_objects = raw_data.strip().split('\n')

    # Parse the JSON objects
    parsed_data = [json.loads(obj) for obj in json_objects]

    # Extract relevant fields
    extracted_data = []
    for item in parsed_data:
        response_body = item['response']['body']
        usage = response_body['usage']
        choice = response_body['choices'][0]
        
        extracted_data.append({
            'batch_id': item['id'],
            'custom_id': item['custom_id'],
            'status_code': item['response']['status_code'],
            'request_id': item['response']['request_id'],
            'completion_id': response_body['id'],
            'model': response_body['model'],
            'created': response_body['created'],
            'assistant_message': choice['message']['content'],
            'finish_reason': choice['finish_reason'],
            'prompt_tokens': usage['prompt_tokens'],
            'completion_tokens': usage['completion_tokens'],
            'total_tokens': usage['total_tokens'],
            'system_fingerprint': response_body['system_fingerprint']
        })

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data)

    df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
    df['reap_id'] = df['reap_id'].astype(int)
    df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
    # merge reaps
    df = df.merge(df_reaps_merge, left_on='reap_id', right_on='reap_id', how='left')
    # merge issues
    df = df.merge(df_issues_merge, left_on='issue_id', right_on='issue_id', how='left')
    # df['code'] = df['code'].astype(int)
    
    return df
    
    

primals_dfs = []
for i, data in enumerate([primals_1, primals_2]):
    data = data.text
    df = convert_batch_output_to_pd(data)
    df['batch'] = i + 1
    primals_dfs.append(df)
    
values_dfs = []
for i, data in enumerate([values_1, values_2, values_3]):
    data = data.text
    df = convert_batch_output_to_pd(data)
    df['batch'] = i + 1
    values_dfs.append(df)
    

In [33]:
primals_df = pd.concat(primals_dfs)
display(primals_df.head())
# primals_df["custom_id"].iloc[0]
values_df = pd.concat(values_dfs)
primals_df.to_csv("primals_coding_results.csv", index=False)
values_df.to_csv("values_coding_results.csv", index=False)

Unnamed: 0,batch_id,custom_id,status_code,request_id,completion_id,model,created,assistant_message,finish_reason,prompt_tokens,...,system_fingerprint,pid,domain,reap_id,dimension,code,issue_id,reap_text,issue_summary,batch
0,batch_req_677f852bf31481909d6d286f73a10f6d,601f5a82dc8ed94a9da4461e-relationship-189-abun...,200,36ad074f307f9c2c460c77c8a2690e8b,chatcmpl-AnfjmXV9Oe79x5Tqs7Z35bDB388mo,gpt-4o-2024-08-06,1736401874,The reappraisal emphasizes the excitement and ...,stop,347,...,fp_703d4ff298,601f5a82dc8ed94a9da4461e,relationship,189,abundant,1,102,"Sometimes, the anticipation and imagination of...",You want to expand your friend group to includ...,1
1,batch_req_677f852c217881908da67f4c0636e5eb,601f5a82dc8ed94a9da4461e-relationship-189-acce...,200,9e7381e1c9f324d6f5d0d9611ee84f5d,chatcmpl-AnfjmJ7XiZ7t76KxU3LYdu732dlaJ,gpt-4o-2024-08-06,1736401874,The reappraisal focuses on the excitement and ...,stop,344,...,fp_b7d65f1a5b,601f5a82dc8ed94a9da4461e,relationship,189,acceptable,0,102,"Sometimes, the anticipation and imagination of...",You want to expand your friend group to includ...,1
2,batch_req_677f852c598881908ec44d762562ac1a,601f5a82dc8ed94a9da4461e-relationship-189-beau...,200,41602ae44bb85f5994ce86dc2f26854c,chatcmpl-Anfjm3TdHf5Zk5pXoafQ7WzIZ3Dmt,gpt-4o-2024-08-06,1736401874,The reappraisal focuses on the excitement and ...,stop,365,...,fp_b7d65f1a5b,601f5a82dc8ed94a9da4461e,relationship,189,beautiful,0,102,"Sometimes, the anticipation and imagination of...",You want to expand your friend group to includ...,1
3,batch_req_677f852c87008190863f83a7cbec7554,601f5a82dc8ed94a9da4461e-relationship-189-chan...,200,0d4a987fa05b19aeb574734205516c3f,chatcmpl-AnfjmEP6aVkvLZfFwkhu2NzGOSQaU,gpt-4o-2024-08-06,1736401874,The cognitive reappraisal focuses on the excit...,stop,392,...,fp_703d4ff298,601f5a82dc8ed94a9da4461e,relationship,189,changing,0,102,"Sometimes, the anticipation and imagination of...",You want to expand your friend group to includ...,1
4,batch_req_677f852cac3c81908db0ff84cbdfe3c1,601f5a82dc8ed94a9da4461e-relationship-189-coop...,200,05aae834fa8bd3f39a0a11919fe260e7,chatcmpl-Anfjm5gSiJ3dXkYvpWH0z5fxTEV2L,gpt-4o-2024-08-06,1736401874,The reappraisal emphasizes the excitement and ...,stop,344,...,fp_5f20662549,601f5a82dc8ed94a9da4461e,relationship,189,cooperative,0,102,"Sometimes, the anticipation and imagination of...",You want to expand your friend group to includ...,1


# Produce embeddings

In [12]:
df_issues_reaps

Unnamed: 0,domain,participant_id,summary,id,text
0,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,46,It's understandable to feel anxious about bein...
1,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,47,"Social interactions can be daunting, especiall..."
2,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,48,"Feeling awkward can lead to anxiety, but it ca..."
3,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,49,People treating you differently based on perce...
4,relationship,6632edd68127fba862de05bf,You experience social anxiety and feel easily ...,50,Networking and maintaining friendships might b...
...,...,...,...,...,...
2687,relationship,66da38dc1d5b9365584508ca,You are experiencing problems with your attitu...,3200,"When we struggle with expressing our emotions,..."
2688,relationship,66da38dc1d5b9365584508ca,You are experiencing problems with your attitu...,3201,Feeling out of control can be reframed as an i...
2689,relationship,66da38dc1d5b9365584508ca,You are experiencing problems with your attitu...,3198,It's important to recognize that your awarenes...
2690,relationship,66da38dc1d5b9365584508ca,You are experiencing problems with your attitu...,3202,Experiencing these conflicts might help you de...


In [17]:

import os
import json
from datetime import datetime, timezone

DIMS = 128

# batch_embed_submission_fpath = f"batch_{DIMS}_embeddings_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.jsonl"
batch_embed_submission_fpath = f"batch_{DIMS}_embeddings.jsonl"
with open(batch_embed_submission_fpath, "w", encoding="utf-8") as f_out:
    for _, row in df_issues_reaps.iterrows():

        # Extract needed fields
        pid = row['participant_id']
        domain = row['domain']
        reap_id = row['id']
        # issue = row['summary']
        reappraisal = row['text']

        # Construct the JSON for each line
        data = {
            "custom_id": f"{pid}-{domain}-{reap_id}",
            "method": "POST",
            "url": "/v1/embeddings",
            "body": {
                "model": "text-embedding-3-small",
                "input": reappraisal,
                "dimensions": DIMS
            }
        }

        # Write each request as one line of JSON
        f_out.write(json.dumps(data, ensure_ascii=False) + "\n")

In [18]:
import json
from openai import OpenAI

# Function to read and split the JSONL file into chunks of 50k lines
def read_and_split_jsonl(file_path, max_lines=50000):
    with open(file_path, "r") as file:
        lines = file.readlines()
    
    # Split the lines into chunks of max_lines
    chunks = [lines[i:i + max_lines] for i in range(0, len(lines), max_lines)]
    return chunks


client = OpenAI()

# Read and split the original file
chunks = read_and_split_jsonl(batch_embed_submission_fpath)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = batch_embed_submission_fpath  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/embeddings",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} of batch processing - embeddings"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-UteLs3hjhk6XLeyaQZwNDZ', bytes=1078710, created_at=1737342656, filename='batch_128_embeddings.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_678dbec3a8488190b81970310368aab3', completion_window='24h', created_at=1737342659, endpoint='/v1/embeddings', input_file_id='file-UteLs3hjhk6XLeyaQZwNDZ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1737429059, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 of batch processing - embeddings'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [27]:

embedding_file_id = "file-EM9qNrDQhLRGwgvgwmJJJu"
embeddings = client.files.content(embedding_file_id).text.strip().split('\n')
embeddings = [json.loads(emb) for emb in embeddings]
# write to json
with open("embeddings.json", "w") as f:
    json.dump(embeddings, f, indent=2)

# import json
# import pandas as pd


# def convert_batch_output_to_pd(raw_data):

#     # Split the data into separate JSON strings
#     json_objects = raw_data.strip().split('\n')

#     # Parse the JSON objects
#     parsed_data = [json.loads(obj) for obj in json_objects]

#     # Extract relevant fields
#     extracted_data = []
#     for item in parsed_data:
#         response_body = item['response']['body']
#         usage = response_body['usage']
#         choice = response_body['choices'][0]
        
#         extracted_data.append({
#             'batch_id': item['id'],
#             'custom_id': item['custom_id'],
#             'status_code': item['response']['status_code'],
#             'request_id': item['response']['request_id'],
#             'completion_id': response_body['id'],
#             'model': response_body['model'],
#             'created': response_body['created'],
#             'assistant_message': choice['message']['content'],
#             'finish_reason': choice['finish_reason'],
#             'prompt_tokens': usage['prompt_tokens'],
#             'completion_tokens': usage['completion_tokens'],
#             'total_tokens': usage['total_tokens'],
#             'system_fingerprint': response_body['system_fingerprint']
#         })

#     # Convert to DataFrame
#     df = pd.DataFrame(extracted_data)

#     df[['pid', 'domain', 'reap_id', 'dimension']] = df['custom_id'].str.split('-', expand=True)
#     df['reap_id'] = df['reap_id'].astype(int)
#     df['code'] = df['assistant_message'].str.extract(r'Code\[(\d)\]')
#     # merge reaps
#     df = df.merge(df_reaps_merge, left_on='reap_id', right_on='reap_id', how='left')
#     # merge issues
#     df = df.merge(df_issues_merge, left_on='issue_id', right_on='issue_id', how='left')
#     # df['code'] = df['code'].astype(int)
    
#     return df
    
    

# primals_dfs = []
# for i, data in enumerate([primals_1, primals_2]):
#     data = data.text
#     df = convert_batch_output_to_pd(data)
#     df['batch'] = i + 1
#     primals_dfs.append(df)
    
# values_dfs = []
# for i, data in enumerate([values_1, values_2, values_3]):
#     data = data.text
#     df = convert_batch_output_to_pd(data)
#     df['batch'] = i + 1
#     values_dfs.append(df)
    

# Rating uncertainty

## Prompt

In [40]:
prompt_uncertainty = """
<ignore>{salt}</ignore>

You are an expert qualitative coder analyzing a person’s description of an emotional issue.
Your task is to assess how uncertain the person about what is happening in their issue, where uncertainty is an appraisal dimension. 
Uncertainty reflects how unclear, ambiguous, or tentative the person is in describing their emotions, events, or interpretations.

Rate the level of uncertainty using the following scale:
	-	5 (Very Certain): The person is clear, confident, and decisive in their description, with no apparent doubt or ambiguity.
	-	4 (Mostly Certain): The description is mostly clear, with occasional minor hesitations or vagueness.
	-	3 (Moderately Uncertain): The person expresses some ambiguity or doubt, with noticeable hedging or a lack of clarity in parts of the narrative.
	-	2 (Highly Uncertain): The description contains frequent hesitations, contradictions, or significant vagueness, suggesting a high level of doubt or ambiguity.
	-	1 (Extremely Uncertain): The person is very unclear, vague, or contradictory throughout their description, indicating extreme uncertainty.

Very briefly (1-3 sentences) walk through your thought process about how uncertain the individual is regarding the emotional issue they're facing and
then at the end provide a rating from 1 to 5 formatted as follows:

- `Rating[5]` for very certain
- `Rating[4]` for mostly certain
- `Rating[3]` for moderately uncertain
- `Rating[2]` for highly uncertain
- `Rating[1]` for extremely uncertain
"""

## Create batch

In [46]:
import os
import json
import itertools


MODEL = "gpt-4o"

all_pids = df_reaps['participant_id'].unique()
all_domains = ["career", "relationship"]
with open("batch_uncertainty.jsonl", "w", encoding="utf-8") as f_out:
    
    for pid, domain in itertools.product(all_pids, all_domains):
        
        
        # Generate a random salt
        salt = os.urandom(8).hex()
        
        sys_prompt = prompt_uncertainty.format(salt=salt)
        
        # get messages for pid domain
        msg_df = df_messages.loc[(df_messages['participant_id'] == pid) & (df_messages['domain'] == domain) & (df_messages['state'] == 'issue')]
        # sort by id
        msg_df = msg_df.sort_values('id')
        
        # turn into a list of messages
        msgs = []
        for _, msg in msg_df.iterrows():
            if (msg["role"] == "assistant") and ("tell me about an issue in your life that" in msg["content"]):
                continue
            # if content is nan, skip
            if pd.isna(msg["content"]):
                continue
            msgs.append({
                "role": msg["role"],
                "content": msg["content"]
            })
        
        sys_prompt_msgs = msgs + [{"role": "system", "content": sys_prompt}]
        
        data = {
            "custom_id": f"{pid}-{domain}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": MODEL,
                "messages": sys_prompt_msgs,
                "temperature": TEMP,
                "max_tokens": 120,
            }
        }
        

        # Write each request as one line of JSON
        f_out.write(json.dumps(data, ensure_ascii=False) + "\n")
        
    
        

In [49]:
import json
from openai import OpenAI

# Function to read and split the JSONL file into chunks of 50k lines
def read_and_split_jsonl(file_path, max_lines=50000):
    with open(file_path, "r") as file:
        lines = file.readlines()
    
    # Split the lines into chunks of max_lines
    chunks = [lines[i:i + max_lines] for i in range(0, len(lines), max_lines)]
    return chunks


client = OpenAI()

# Read and split the original file
file_path = "batch_uncertainty.jsonl"
chunks = read_and_split_jsonl(file_path)

# Process each chunk with the original filename
for i, chunk in enumerate(chunks):
    # Create a temporary in-memory file-like object to upload
    from io import BytesIO
    
    temp_file = BytesIO("".join(chunk).encode("utf-8"))
    temp_file.name = file_path  # Keep the original filename

    # Upload each chunk
    batch_input_file = client.files.create(
        file=temp_file,
        purpose="batch"
    )
    print(batch_input_file)

    # Create a batch for each chunk
    batch_input_file_id = batch_input_file.id
    batch_obj = client.batches.create(
        input_file_id=batch_input_file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": f"Chunk {i + 1} of batch processing - uncertainty 3"
        }
    )
    print(batch_obj)

    temp_file.close()  # Close the in-memory file

FileObject(id='file-FRKrs5yr3KTJ9KC2c1dp6P', bytes=2918471, created_at=1737422914, filename='batch_uncertainty.jsonl', object='file', purpose='batch', status='processed', status_details=None)
Batch(id='batch_678ef845fbe48190941795610b5417a1', completion_window='24h', created_at=1737422918, endpoint='/v1/chat/completions', input_file_id='file-FRKrs5yr3KTJ9KC2c1dp6P', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1737509318, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Chunk 1 of batch processing - uncertainty 3'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


## Retrieve batch

In [159]:


uncertainty_1_file_id = "file-JqheSw2CHKXhZvAgP9tnPT"
uncertainty_2_file_id = "file-C82tTvU2jp616U3YGptkUC"
uncertainty_3_file_id = "file-DzbxdrvPWDPFEucEgpvmC3"

uncertainty_1_content = client.files.content(uncertainty_1_file_id)
uncertainty_2_content = client.files.content(uncertainty_2_file_id)
uncertainty_3_content = client.files.content(uncertainty_3_file_id)


uncertainty_1_text = uncertainty_1_content.text
uncertainty_2_text = uncertainty_2_content.text
uncertainty_3_text = uncertainty_3_content.text

uncertainty_1 = [json.loads(obj) for obj in uncertainty_1_text.strip().split('\n')]
uncertainty_2 = [json.loads(obj) for obj in uncertainty_2_text.strip().split('\n')]
uncertainty_3 = [json.loads(obj) for obj in uncertainty_3_text.strip().split('\n')]

# uncertainty_combined = uncertainty_1 + uncertainty_2 + uncertainty_3


In [164]:

uncertainty_list = []
for obj in uncertainty_1:
    new_obj = {
        "pid": obj["custom_id"].split("-")[0],
        "domain": obj["custom_id"].split("-")[1],
        "assistant_message": obj["response"]["body"]["choices"][0]["message"]["content"],
        "batch": 1
    }
    uncertainty_list.append(new_obj)
for obj in uncertainty_2:
    new_obj = {
        "pid": obj["custom_id"].split("-")[0],
        "domain": obj["custom_id"].split("-")[1],
        "assistant_message": obj["response"]["body"]["choices"][0]["message"]["content"],
        "batch": 2
    }
    uncertainty_list.append(new_obj)
for obj in uncertainty_3:
    new_obj = {
        "pid": obj["custom_id"].split("-")[0],
        "domain": obj["custom_id"].split("-")[1],
        "assistant_message": obj["response"]["body"]["choices"][0]["message"]["content"],
        "batch": 3
    }
    uncertainty_list.append(new_obj)

uncertainty_df = pd.DataFrame(uncertainty_list)
uncertainty_df['rating'] = uncertainty_df['assistant_message'].str.extract(r'Rating\[(\d)\]')
uncertainty_df['rating'] = uncertainty_df['rating'].str.extract(r'(\d)')
uncertainty_df['rating'] = uncertainty_df['rating'].astype(int)

uncertainty_wide_df = uncertainty_df.pivot(index=["pid", "domain"], columns="batch", values=["assistant_message", "rating"])

# Flatten the multi-level columns for readability
uncertainty_wide_df.columns = [f"{col[0]}_{col[1]}" for col in uncertainty_wide_df.columns]

# Reset index for better usability
uncertainty_wide_df.reset_index(inplace=True)


# print(uncertainty_wide_df.rating.value_counts())
# print(uncertainty_df['rating'].value_counts())

uncertainty_wide_df.to_csv("uncertainty_coding_results.csv", index=False)

In [65]:
df_issues

Unnamed: 0,id,participant_id,domain,neg,pos,summary,created_at,updated_at,deleted_at
0,1,test-20250107_163900,career,,,,2025-01-07 16:38:56.104675,2025-01-07 16:38:56.104690,
1,2,test-20250107_163900,relationship,,,,2025-01-07 16:38:56.104675,2025-01-07 16:38:56.104690,
2,3,test-20250107_163715,career,,,,2025-01-07 16:42:10.269918,2025-01-07 16:42:10.269968,
3,4,test-20250107_163715,relationship,,,,2025-01-07 16:42:10.269918,2025-01-07 16:42:10.269968,
4,5,test-20250107_165457,career,,,,2025-01-07 16:54:54.111640,2025-01-07 16:54:54.111652,
...,...,...,...,...,...,...,...,...,...
705,707,66da38dc1d5b9365584508ca,career,60.0,4.0,You are experiencing a significant drop in inc...,2025-01-08 20:20:17.544185,2025-01-08 20:37:20.284298,
706,705,66639f08a4e81235c3d7a659,career,67.0,30.0,You are facing challenges with maintaining wor...,2025-01-08 20:07:58.468453,2025-01-08 20:37:53.105463,
707,710,65e1204f1bee2811d7894471-2,relationship,,,,2025-01-08 20:39:11.679525,2025-01-08 20:39:11.679526,
708,709,65e1204f1bee2811d7894471-2,career,38.0,50.0,,2025-01-08 20:39:11.679514,2025-01-08 21:05:40.550235,
