In [1]:
%env CUDA_VISIBLE_DEVICES=3,4
%env TMPDIR=/raid/users/ryan_cheng/tmp
import os
import glob
import re
import json
import random
import time
import pickle
from absl import app, flags
from tqdm import tqdm
from datetime import datetime
import openai
from openai import OpenAI
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

np.random.seed(0)

from utils import *
import utils
from consistency_eval import *
from education_generation import *

try:
    from vllm import LLM, SamplingParams
    import ray
except ImportError:
    pass

env: CUDA_VISIBLE_DEVICES=3,4
env: TMPDIR=/raid/users/ryan_cheng/tmp
INFO 04-28 21:49:38 __init__.py:190] Automatically detected platform cuda.


In [2]:
# Read the OpenAI API key from a file located one directory above the current
# working directory and install the resulting client on the `utils` module.
# Keeping the key in a file outside the notebook avoids hardcoding it here.
with open(os.path.abspath('../ryan_openai.txt'), 'r') as f:
    # Only trailing newline characters are stripped from the file contents.
    utils.client = OpenAI(api_key=f.read().rstrip('\n'))

In [3]:
# Select the conversation file to evaluate. Exactly one `filename` assignment
# should be active; the other is kept commented out for switching datasets.
# old therapy convs
# filename = '/nfs/kun2/users/ryan_cheng/consistency_LLMs/therapy/exp/04.22.25/Llama-3.1-8B-Instruct_0_500.json'
# education convs
filename = '/nfs/kun2/users/ryan_cheng/consistency_LLMs/data/education/exp/04.28.25/Llama-3.1-8B-Instruct_0_395.json'

# Therapy task configuration (read below for "agent1_role" and copied into `prompts`).
with open("/nfs/kun2/users/ryan_cheng/consistency_LLMs/therapy/config_therapy.json", 'r') as f:
    config_therapy = json.load(f)

# Conversations to evaluate; the loops further down iterate over `data` as a
# sequence of conversation records.
with open(filename, 'r') as f:
    data = json.load(f)

# Model configuration, resolved relative to the current working directory.
with open("./config/education/gpt-4o-mini.json", 'r') as f:
    config_gpt4_mini = json.load(f)

# `config` is not defined in this notebook; presumably it is provided by one of
# the star imports in the first cell -- TODO confirm. Existing keys are overwritten.
for key, value in config_gpt4_mini.items():
    config[key] = value



# Same for `prompts` (also not defined in this notebook).
# NOTE(review): the therapy config is merged into `prompts` even when the
# education conversations are selected above -- confirm this is intended.
for key, value in config_therapy.items():
    prompts[key] = value

In [None]:
# Scratch cell: a bare string literal that only echoes the education
# conversation path (the same value assigned to `filename`); it binds no name.
"/nfs/kun2/users/ryan_cheng/consistency_LLMs/data/education/exp/04.28.25/Llama-3.1-8B-Instruct_0_395.json"

In [4]:
# Load the evaluation prompt templates. The evaluation functions below index
# `eval_prompts` by template name and fill %PLACEHOLDER% tokens via str.replace.
with open("/nfs/kun2/users/ryan_cheng/consistency_LLMs/config/eval_prompts.json" , 'r') as f:
    eval_prompts = json.load(f)

In [None]:
# Evaluator choice (alternative 1 of 3): Llama-3.1-8B-Instruct.
# Run only one of the evaluator-selection cells; the last one executed wins.
config['eval_model'] = 'Llama-3.1-8B-Instruct'
# Presumably the number of GPUs used to serve the evaluator -- TODO confirm in utils.
config['gpus'] = 1

In [5]:
# Evaluator choice (alternative 2 of 3): Llama-3.1-70B-Instruct.
# Run only one of the evaluator-selection cells; the last one executed wins.
config['eval_model'] = 'Llama-3.1-70B-Instruct'
# Presumably the number of GPUs used to serve the evaluator (two devices are
# exposed through CUDA_VISIBLE_DEVICES in the first cell) -- TODO confirm in utils.
config['gpus'] = 2

In [None]:
# Evaluator choice (alternative 3 of 3): gpt-4o-mini.
# Run only one of the evaluator-selection cells; the last one executed wins.
config['eval_model'] = 'gpt-4o-mini'

In [None]:
# Override the task name stored in the shared `config`.
# NOTE(review): nothing in this notebook reads 'task_name' directly; presumably
# it is consumed by helper code outside the notebook -- confirm.
config['task_name'] = 'Therapy'

In [None]:
# Inspect the loaded evaluation prompt templates (display only).
eval_prompts

In [6]:
def extract_list(text):
    """Return the first bracketed list literal found in ``text``.

    The search is non-greedy and single-line: it takes the span from the first
    ``[`` to the nearest following ``]`` on the same line, so nested lists and
    lists broken across lines are not supported.

    The span is parsed with ``ast.literal_eval`` rather than ``eval`` because
    ``text`` is raw model output: only Python literals (numbers, strings, ...)
    are accepted and nothing in the text is ever executed. Spans that are not a
    plain literal list (names, calls, arithmetic, broken syntax) yield ``[]``.

    Args:
        text: String to search, typically an evaluator model's reply.

    Returns:
        The parsed list, or ``[]`` when no bracketed span is found or the span
        is not a valid literal.
    """
    import ast  # local import keeps this cell independent of the import cell

    match = re.search(r'\[.*?\]', text)
    if match is None:
        return []
    try:
        return ast.literal_eval(match.group())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval signals non-literal or malformed input with these errors.
        return []

In [7]:
# Expose the loaded templates under prompts['eval_prompts'], which is where the
# index-consistency functions below look them up.
prompts['eval_prompts'] = eval_prompts

In [None]:
# Scratch data; not referenced by any other cell visible in this notebook.
test_list = ["a", "b", "c", "d", "e", "f", "g"]

In [None]:
# Inspect the template used by eval_index_background_consistency (display only).
prompts["eval_prompts"]["index_consistency_background"]

In [None]:
def format_conversation(conversation):
    """Render indexed conversation lines as one string.

    Every item of ``conversation`` is unpacked as ``(index, line)`` and turned
    into ``"<index>: <line>"``. The rendered items are concatenated without any
    separator, so line breaks must already be part of each line.

    Args:
        conversation: Iterable of two-item entries ``(index, line)`` where
            ``line`` is a string.

    Returns:
        The concatenated text; an empty string for an empty conversation.
    """
    rendered = []
    for idx, text in conversation:
        rendered.append(str(idx) + ": " + text)
    return "".join(rendered)

In [None]:
# Sanity check: print the first three formatted lines of the first conversation.
print(format_conversation(data[0]["conversation"][:3]))

In [None]:
def eval_index_consistency(conv_dict, both_agents=False):
    """Score how often a speaker's lines conflict with earlier conversation lines.

    For every evaluated line the evaluator model receives the
    ``index_consistency`` template filled with the scenario, the speaker's role,
    the conversation before that line and the line itself. The list parsed from
    the reply by ``extract_list`` is counted as the earlier lines flagged for
    that line.

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (1 or 2: which agent speaks first) and ``"conversation"`` (iterable
            of ``(index, line)`` pairs).
        both_agents: If true, agent 2's lines are evaluated as well; otherwise
            only agent 1's lines are sent to the evaluator.

    Returns:
        The same ``conv_dict`` with these keys (re)written:
        ``eval_index_consistency`` -- list of ``(index, raw evaluator output)``;
        ``P1_index_consistency_score`` (and ``P2_index_consistency_score`` when
        ``both_agents``) -- ``1 - flagged / possible``, where ``possible`` sums
        ``index // 2`` over the evaluated lines. A score stays at its initial
        ``0`` when no line of that agent was evaluated.
    """
    conv_dict['eval_index_consistency'] = []
    conv_dict['P1_index_consistency_score'] = 0
    if both_agents:
        conv_dict['P2_index_consistency_score'] = 0

    role_keys = {1: "agent1_role", 2: "agent2_role"}
    score_keys = {1: 'P1_index_consistency_score', 2: 'P2_index_consistency_score'}
    possible = {1: 0, 2: 0}  # per agent: how many earlier lines could have been flagged

    pturn = conv_dict["pturn"]
    for i, line in conv_dict["conversation"]:
        if i < 2:  # skip first 2 lines of dialogue (turn order is unaffected: one line each)
            continue
        if pturn == 1:
            speaker, pturn = 1, 2
        elif pturn == 2:
            speaker, pturn = 2, 1
        else:
            continue  # unknown turn marker: nothing is evaluated
        if speaker == 2 and not both_agents:
            continue

        # The slice assumes a line's index equals its position in the list -- TODO confirm.
        prompt = prompts["eval_prompts"]["index_consistency"] \
            .replace("%SCENARIO_DESC%", prompts["scenario"]) \
            .replace("%SPEAKER_ROLE%", prompts[role_keys[speaker]]) \
            .replace("%CONVERSATION%", format_conversation(conv_dict["conversation"][:i])) \
            .replace("%SPEAKER_LINE%", line)
        # .get(): a config without a 'verbose' entry means "quiet" instead of a
        # KeyError, matching the other evaluation functions in this notebook.
        if config.get('verbose', False):
            print(prompt)
        output = completion_create(config['eval_model'], config, prompt)
        index_list = extract_list(output)
        conv_dict['eval_index_consistency'].append((i, output))
        conv_dict[score_keys[speaker]] += len(index_list)
        # i // 2 is the number of earlier lines in the same alternating position
        # as line i, i.e. presumably the earlier lines of the same speaker.
        possible[speaker] += i // 2

    for speaker in (1, 2):
        if possible[speaker] > 0:
            key = score_keys[speaker]
            conv_dict[key] = 1 - conv_dict[key] / possible[speaker]

    return conv_dict

In [None]:
def eval_index_background_consistency(conv_dict, both_agents=False):
    """Index-consistency scoring that also shows the speaker's backstory to the evaluator.

    For every evaluated line the evaluator model receives the
    ``index_consistency_background`` template filled with the scenario, the
    speaker's backstory (``conv_dict["P1"]`` / ``conv_dict["P2"]``), the
    speaker's role, the conversation before that line and the line itself. The
    list parsed from the reply by ``extract_list`` is counted as flagged lines.

    Note that the results are stored under ``eval_index_consistency`` and
    ``P*_index_consistency_score`` -- the key names do not mention
    "background" -- so any values already stored under those keys are
    overwritten.

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (1 or 2: which agent speaks first), ``"conversation"`` (iterable of
            ``(index, line)`` pairs), ``"P1"`` and, with ``both_agents``,
            ``"P2"``.
        both_agents: If true, agent 2's lines are evaluated as well; otherwise
            only agent 1's lines are sent to the evaluator.

    Returns:
        The same ``conv_dict`` with ``eval_index_consistency`` (list of
        ``(index, raw evaluator output)``) and the per-agent scores
        ``1 - flagged / possible``, where ``possible`` sums ``index // 2`` over
        the evaluated lines. A score stays at its initial ``0`` when no line of
        that agent was evaluated.
    """
    conv_dict['eval_index_consistency'] = []
    conv_dict['P1_index_consistency_score'] = 0
    if both_agents:
        conv_dict['P2_index_consistency_score'] = 0

    role_keys = {1: "agent1_role", 2: "agent2_role"}
    backstory_keys = {1: "P1", 2: "P2"}
    score_keys = {1: 'P1_index_consistency_score', 2: 'P2_index_consistency_score'}
    possible = {1: 0, 2: 0}  # per agent: how many earlier lines could have been flagged

    pturn = conv_dict["pturn"]
    for i, line in conv_dict["conversation"]:
        if i < 2:  # skip first 2 lines of dialogue (turn order is unaffected: one line each)
            continue
        if pturn == 1:
            speaker, pturn = 1, 2
        elif pturn == 2:
            speaker, pturn = 2, 1
        else:
            continue  # unknown turn marker: nothing is evaluated
        if speaker == 2 and not both_agents:
            continue

        # The slice assumes a line's index equals its position in the list -- TODO confirm.
        prompt = prompts["eval_prompts"]["index_consistency_background"] \
            .replace("%SCENARIO_DESC%", prompts["scenario"]) \
            .replace("%SPEAKER_BACKSTORY%", conv_dict[backstory_keys[speaker]]) \
            .replace("%SPEAKER_ROLE%", prompts[role_keys[speaker]]) \
            .replace("%CONVERSATION%", format_conversation(conv_dict["conversation"][:i])) \
            .replace("%SPEAKER_LINE%", line)
        # .get(): a config without a 'verbose' entry means "quiet" instead of a
        # KeyError, matching the other evaluation functions in this notebook.
        if config.get('verbose', False):
            print(prompt)
        output = completion_create(config['eval_model'], config, prompt)
        index_list = extract_list(output)
        conv_dict['eval_index_consistency'].append((i, output))
        conv_dict[score_keys[speaker]] += len(index_list)
        # i // 2 is the number of earlier lines in the same alternating position
        # as line i, i.e. presumably the earlier lines of the same speaker.
        possible[speaker] += i // 2

    for speaker in (1, 2):
        if possible[speaker] > 0:
            key = score_keys[speaker]
            conv_dict[key] = 1 - conv_dict[key] / possible[speaker]

    return conv_dict

In [None]:
def eval_prompt_consistency(conv_dict):
    """Check each agent-1 (Patient) line against the agent's backstory with three templates.

    Every line spoken by agent 1 is sent to the evaluator once per template
    (``strategy_consistency``, ``background_consistency``,
    ``combined_prompt_consistency``). A reply that does not contain ``"YES"``
    counts as "no contradiction".

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (1 or 2: which agent speaks first), ``"conversation"`` (entries
            whose first two items are the line number and the line text) and
            ``"P1"`` (agent 1's backstory).

    Returns:
        The same ``conv_dict`` with ``eval_prompt_consistency`` (template name ->
        list of ``(line_number, raw evaluator output)``) and
        ``P1_prompt_consistency_scores`` (template name -> fraction of evaluated
        lines without a contradiction; stays ``0`` if no line was evaluated).
    """
    keys = ("strategy_consistency", "background_consistency", "combined_prompt_consistency")
    conv_dict['eval_prompt_consistency'] = {key: [] for key in keys}
    conv_dict['P1_prompt_consistency_scores'] = {key: 0 for key in keys}
    p1_utterances = {key: 0 for key in keys}

    pturn = conv_dict["pturn"]
    for line in conv_dict["conversation"]:
        line_number = line[0]
        convo_line = line[1]
        if pturn == 1:
            for key in keys:
                # The placeholder must include its closing '%'; replacing only
                # "%SCENARIO_DESC" left a stray '%' in every prompt.
                # NOTE(review): the strategy_consistency template also contains
                # %SPEAKER_STRATEGY%, which nothing here fills in -- confirm.
                prompt = eval_prompts[key].replace("%SCENARIO_DESC%", 'There is a Patient in conversation with a Therapist.') \
                                          .replace("%SPEAKER_ROLE%", config_therapy["agent1_role"]) \
                                          .replace("%SPEAKER_BACKSTORY%", conv_dict["P1"]) \
                                          .replace("%SPEAKER_LINE%", convo_line)
                if config.get('verbose', False):
                    print(prompt)
                output = completion_create(config['eval_model'], config, prompt)
                conv_dict['eval_prompt_consistency'][key].append((line_number, output))
                if "YES" not in output:  # no contradiction
                    conv_dict['P1_prompt_consistency_scores'][key] += 1
                p1_utterances[key] += 1
            pturn = 2
        elif pturn == 2:
            pturn = 1  # agent 2's lines are not evaluated

    for key in keys:
        if p1_utterances[key] > 0:
            conv_dict['P1_prompt_consistency_scores'][key] /= p1_utterances[key]

    print(conv_dict)
    return conv_dict

In [8]:
def eval_prompt_consistency_ablations(conv_dict):
    """Run one more evaluation round of the combined prompt templates on agent 1's lines.

    The function is meant to be called repeatedly on the same record. The first
    call (no ``P1_prompt_consistency_scores`` key yet) creates the storage and
    uses round index 0; every later call derives its round index from how many
    outputs the first stored entry already holds.

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (1 or 2: which agent speaks first), ``"conversation"`` (entries
            whose first two items are the line number and the line text) and
            ``"P1"`` (agent 1's backstory).

    Returns:
        The same ``conv_dict`` with ``eval_prompt_consistency`` (template name ->
        list of ``[line_number, output_round0, output_round1, ...]``) and
        ``P1_prompt_consistency_scores`` (template name -> {round index:
        fraction of evaluated lines whose reply does not contain ``"YES"``}).
    """
    p1_utterances = {}
    keys = ["combined_prompt_consistency", "forwards_combined_prompt_consistency"]
    if "P1_prompt_consistency_scores" not in conv_dict:
        conv_dict['eval_prompt_consistency'] = {}
        conv_dict['P1_prompt_consistency_scores'] = {}
        index = 0
        for key in keys:
            conv_dict['P1_prompt_consistency_scores'][key] = {}
            conv_dict['eval_prompt_consistency'][key] = []
    else:
        # Each entry is [line_number, out_0, ..., out_k-1]; its length minus one
        # is the number of rounds already stored. Requires at least one entry.
        index = len(conv_dict['eval_prompt_consistency'][keys[0]][0]) - 1
        print(index)

    for key in keys:
        conv_dict['P1_prompt_consistency_scores'][key][index] = 0
        p1_utterances[key] = 0

    pturn = conv_dict["pturn"]
    for line in conv_dict["conversation"]:
        line_number = line[0]
        convo_line = line[1]
        if pturn == 1:
            for key in keys:
                # The placeholder must include its closing '%'; replacing only
                # "%SCENARIO_DESC" left a stray '%' in every prompt.
                prompt = eval_prompts[key].replace("%SCENARIO_DESC%", 'There is a Patient in conversation with a Therapist.') \
                                          .replace("%SPEAKER_ROLE%", config_therapy["agent1_role"]) \
                                          .replace("%SPEAKER_BACKSTORY%", conv_dict["P1"]) \
                                          .replace("%SPEAKER_LINE%", convo_line)
                if config.get('verbose', False):
                    print(prompt)
                output = completion_create(config['eval_model'], config, prompt)
                if index == 0:
                    conv_dict['eval_prompt_consistency'][key].append([line_number, output])
                else:
                    # Speakers alternate, so line_number // 2 is used as the
                    # position of this line's entry from round 0 (assumes line
                    # numbers start at 0 -- TODO confirm).
                    conv_dict['eval_prompt_consistency'][key][line_number // 2].append(output)

                if "YES" not in output:  # no contradiction
                    conv_dict['P1_prompt_consistency_scores'][key][index] += 1
                p1_utterances[key] += 1
            pturn = 2
        elif pturn == 2:
            pturn = 1  # agent 2's lines are not evaluated

    for key in keys:
        if p1_utterances[key] > 0:
            conv_dict['P1_prompt_consistency_scores'][key][index] /= p1_utterances[key]

    print(conv_dict)
    return conv_dict

In [10]:
# Inspect the loaded evaluation prompt templates (display only).
eval_prompts

{'strategy_consistency': "%SCENARIO_DESC% Evaluate whether the following line spoken by %SPEAKER_ROLE% contradicts their background. First, describe the interpreted intention of the statement, and whether or not it aligns with the given background of %SPEAKER_ROLE%. Then, provide 1 sentence explaining your reasoning based on the strategy, followed by YES if the line contradicts the given background of %SPEAKER_ROLE%, and NO if it is consistent with the provided background of %SPEAKER_ROLE%. %SPEAKER_ROLE%'s strategy is described as follows:\n%SPEAKER_STRATEGY%\n %SPEAKER_ROLE% spoke the following line: \n%SPEAKER_LINE%\n\n Provide 1 sentence of reasoning followed by YES or NO.\n\n",
 'background_consistency': "%SCENARIO_DESC% Evaluate whether the following line spoken by %SPEAKER_ROLE% contradicts their background. Provide 1 sentence explaining your reasoning based on the background, followed by YES if the line contradicts the given background of %SPEAKER_ROLE%, and NO if it is consist

In [9]:
def eval_prompt_consistency_ablations_education(conv_dict):
    """Run one more evaluation round of the combined prompt templates on agent 2's (Student) lines.

    The function is meant to be called repeatedly on the same record. The first
    call (no ``P2_prompt_consistency_scores`` key yet) creates the storage and
    uses round index 0; every later call derives its round index from how many
    outputs the first stored entry already holds.

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (1 or 2: which agent speaks first), ``"conversation"`` (entries
            whose first two items are the line number and the line text) and
            ``"P2"`` (agent 2's backstory).

    Returns:
        The same ``conv_dict`` with ``eval_prompt_consistency`` (template name ->
        list of ``[line_number, output_round0, output_round1, ...]``) and
        ``P2_prompt_consistency_scores`` (template name -> {round index:
        fraction of evaluated lines whose reply does not contain ``"YES"``}).
    """
    p2_utterances = {}
    keys = ["combined_prompt_consistency", "forwards_combined_prompt_consistency"]
    if "P2_prompt_consistency_scores" not in conv_dict:
        conv_dict['eval_prompt_consistency'] = {}
        conv_dict['P2_prompt_consistency_scores'] = {}
        index = 0
        for key in keys:
            conv_dict['P2_prompt_consistency_scores'][key] = {}
            conv_dict['eval_prompt_consistency'][key] = []
    else:
        # Each entry is [line_number, out_0, ..., out_k-1]; its length minus one
        # is the number of rounds already stored. Requires at least one entry.
        index = len(conv_dict['eval_prompt_consistency'][keys[0]][0]) - 1
        print(index)

    for key in keys:
        conv_dict['P2_prompt_consistency_scores'][key][index] = 0
        p2_utterances[key] = 0

    pturn = conv_dict["pturn"]
    for line in conv_dict["conversation"]:
        line_number = line[0]
        convo_line = line[1]
        if pturn == 2:
            for key in keys:
                # The placeholder must include its closing '%'; replacing only
                # "%SCENARIO_DESC" left a stray '%' in every prompt.
                prompt = eval_prompts[key].replace("%SCENARIO_DESC%", 'A Teacher is trying to teach a Student about a topic. ') \
                                          .replace("%SPEAKER_ROLE%", "Student") \
                                          .replace("%SPEAKER_BACKSTORY%", conv_dict["P2"]) \
                                          .replace("%SPEAKER_LINE%", convo_line)
                if config.get('verbose', False):
                    print(prompt)
                output = completion_create(config['eval_model'], config, prompt)
                if index == 0:
                    conv_dict['eval_prompt_consistency'][key].append([line_number, output])
                else:
                    # Speakers alternate, so line_number // 2 is used as the
                    # position of this line's entry from round 0 (assumes line
                    # numbers start at 0 -- TODO confirm).
                    conv_dict['eval_prompt_consistency'][key][line_number // 2].append(output)

                if "YES" not in output:  # no contradiction
                    conv_dict['P2_prompt_consistency_scores'][key][index] += 1
                p2_utterances[key] += 1
            pturn = 1
        elif pturn == 1:
            pturn = 2  # agent 1's lines are not evaluated

    for key in keys:
        if p2_utterances[key] > 0:
            conv_dict['P2_prompt_consistency_scores'][key][index] /= p2_utterances[key]

    print(conv_dict)
    return conv_dict

In [None]:
%%capture 

# Education ablation driver: five evaluation rounds per conversation (four in
# the inner loop plus the call whose return value is collected).
# The records in `data` are modified in place, so `test_convs` holds the same objects.
test_convs = []
for conversation in data:
    for i in range(4):
        eval_prompt_consistency_ablations_education(conversation)
    test_convs.append(eval_prompt_consistency_ablations_education(conversation))
    # Rewrite the whole output file after every conversation so that partial
    # results are on disk if the run is interrupted.
    with open("/nfs/kun2/users/ryan_cheng/consistency_LLMs/data/education/exp/04.28.25/ablation_llama70b_Llama-3.1-8B-Instruct_0_395.json", 'w') as f:
        json.dump(test_convs, f, indent=4)  

2025-04-28 21:51:02,044	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


INFO 04-28 21:51:19 config.py:542] This model supports multiple tasks: {'classify', 'reward', 'generate', 'embed', 'score'}. Defaulting to 'generate'.
INFO 04-28 21:51:20 config.py:1401] Defaulting to use mp for distributed inference
INFO 04-28 21:51:20 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='meta-llama/Meta-Llama-3.1-70B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-70B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=12880, download_dir='/raid/users/ryan_cheng/models/', load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoi

In [None]:
# Check which evaluator model is currently selected (display only).
config['eval_model']

In [None]:
# Inspect the loaded evaluation prompt templates (display only).
eval_prompts

In [None]:
%%capture 

# Therapy ablation driver: five evaluation rounds per conversation (four in the
# inner loop plus the call whose return value is collected).
# NOTE(review): this iterates over whatever `data` currently holds; the output
# path is under therapy/, so the therapy `filename` is presumably expected to
# be the one loaded -- confirm before running.
test_convs = []
for conversation in data:
    for i in range(4):
        eval_prompt_consistency_ablations(conversation)
    test_convs.append(eval_prompt_consistency_ablations(conversation))
    # Rewrite the whole output file after every conversation so that partial
    # results are on disk if the run is interrupted.
    with open("/nfs/kun2/users/ryan_cheng/consistency_LLMs/therapy/exp/04.22.25/ablation_llama70b_Llama-3.1-8B-Instruct_0_500.json", 'w') as f:
        json.dump(test_convs, f, indent=4)  

57 min 21 sec Llama-3.1-70B-Instruct

In [None]:
# Index-consistency driver: evaluates agent 1 only (both_agents keeps its
# default of False) for every conversation in `data`.
# NOTE(review): the output path is under therapy/; make sure `data` holds the
# matching conversations before running.
test_convs = []
for conversation in data:
    test_convs.append(eval_index_consistency(conversation))
    # Rewrite the whole output file after every conversation (checkpoint).
    with open("/nfs/kun2/users/ryan_cheng/consistency_LLMs/therapy/exp/04.22.25/index_llama70b_Llama-3.1-8B-Instruct_0_500.json", 'w') as f:
        json.dump(test_convs, f, indent=4)  

In [None]:
# Prompt-consistency driver.
# NOTE(review): `eval_prompt_consistency` is defined several times in this
# notebook; which version runs here depends on the cell executed last.
test_convs = []
for conversation in data:
    test_convs.append(eval_prompt_consistency(conversation))
    # Rewrite the whole output file after every conversation (checkpoint).
    with open("/nfs/kun2/users/ryan_cheng/consistency_LLMs/therapy/exp/04.22.25/llama8beval_Llama-3.1-8B-Instruct_0_500.json", 'w') as f:
        json.dump(test_convs, f, indent=4)  



In [None]:
# Display-only cell.
# NOTE(review): `example_conv` is not assigned anywhere in the visible notebook;
# on a fresh kernel this presumably raises NameError -- confirm or remove.
example_conv

In [None]:
# Inspect the loaded therapy configuration (display only).
config_therapy

In [None]:
def eval_prompt_consistency(conv_dict):
    """Score agent 1's lines against its backstory with the therapy ``prompt_consistency`` template.

    Redefines ``eval_prompt_consistency``; once this cell runs, it replaces any
    earlier definition of that name in the kernel.

    NOTE(review): ``config_llm`` is read here but is not assigned anywhere in
    the visible notebook; presumably it comes from a star import or an older
    session -- confirm before running.

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (1 or 2: which agent speaks first), ``"conversation"`` (entries
            whose first two items are the line number and the line text) and
            ``"P1"`` (agent 1's backstory).

    Returns:
        The same ``conv_dict`` with ``eval_prompt_consistency`` (list of
        ``(line_number, raw evaluator output)``) and
        ``P1_prompt_consistency_score`` (fraction of agent-1 lines whose reply
        does not contain ``"YES"``; stays ``0`` if no line was evaluated).
    """
    #assert 'eval_prompt_consistency' not in conv_dict # warn if we are replacing metrics we don't mean to overwrite
    conv_dict['eval_prompt_consistency'] = []
    conv_dict['P1_prompt_consistency_score'] = 0
    evaluated = 0
    speaker = conv_dict["pturn"]
    for entry in conv_dict["conversation"]:
        number = entry[0]
        text = entry[1]
        if speaker == 2:
            # Agent 2's lines are not evaluated; just hand the turn back.
            speaker = 1
            continue
        if speaker != 1:
            continue

        template = config_therapy["eval_prompts"]["prompt_consistency"]
        prompt = template.replace("%SPEAKER_ROLE%", config_therapy["agent1_role"])
        prompt = prompt.replace("%SPEAKER_BACKSTORY%", conv_dict["P1"])
        prompt = prompt.replace("%SPEAKER_LINE%", text)
        if config_llm['verbose']:
            print(prompt)
        output = completion_create(config_llm['eval_model'], config, prompt)
        conv_dict['eval_prompt_consistency'].append((number, output))
        if "YES" not in output: # no contradiction
            conv_dict['P1_prompt_consistency_score'] += 1
        evaluated += 1
        speaker = 2

    if evaluated > 0:
        conv_dict['P1_prompt_consistency_score'] /= evaluated
    print(conv_dict)

    return conv_dict

In [None]:
def eval_prompt_consistency(conv_dict):
    """Score agent 1's lines against its backstory with the therapy ``prompt_consistency`` template.

    Redefines ``eval_prompt_consistency``; once this cell runs, it replaces any
    earlier definition of that name in the kernel.

    NOTE(review): ``config_llm`` is read here but is not assigned anywhere in
    the visible notebook; presumably it comes from a star import or an older
    session -- confirm before running.

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (1 or 2: which agent speaks first), ``"conversation"`` (entries
            whose first two items are the line number and the line text) and
            ``"P1"`` (agent 1's backstory).

    Returns:
        The same ``conv_dict`` with ``eval_prompt_consistency`` (list of
        ``(line_number, raw evaluator output)``) and
        ``P1_prompt_consistency_score`` (fraction of agent-1 lines whose reply
        does not contain ``"YES"``; stays ``0`` if no line was evaluated).
    """
    #assert 'eval_prompt_consistency' not in conv_dict # warn if we are replacing metrics we don't mean to overwrite
    conv_dict['eval_prompt_consistency'] = []
    conv_dict['P1_prompt_consistency_score'] = 0
    evaluated = 0
    speaker = conv_dict["pturn"]
    for entry in conv_dict["conversation"]:
        number = entry[0]
        text = entry[1]
        if speaker == 2:
            # Agent 2's lines are not evaluated; just hand the turn back.
            speaker = 1
            continue
        if speaker != 1:
            continue

        template = config_therapy["eval_prompts"]["prompt_consistency"]
        prompt = template.replace("%SPEAKER_ROLE%", config_therapy["agent1_role"])
        prompt = prompt.replace("%SPEAKER_BACKSTORY%", conv_dict["P1"])
        prompt = prompt.replace("%SPEAKER_LINE%", text)
        if config_llm['verbose']:
            print(prompt)
        output = completion_create(config_llm['eval_model'], config, prompt)
        conv_dict['eval_prompt_consistency'].append((number, output))
        if "YES" not in output: # no contradiction
            conv_dict['P1_prompt_consistency_score'] += 1
        evaluated += 1
        speaker = 2

    if evaluated > 0:
        conv_dict['P1_prompt_consistency_score'] /= evaluated
    print(conv_dict)

    return conv_dict

In [None]:
def eval_prompt_consistency(conv_dict):
    """Score every line of both agents against the speaking agent's backstory.

    Redefines ``eval_prompt_consistency``; once this cell runs, it replaces any
    earlier definition of that name in the kernel.

    Each line is sent to the evaluator with the ``prompt_consistency`` template
    filled with the speaker's role, backstory and the line. A reply that does
    not contain ``"YES"`` counts as "no contradiction". Any ``pturn`` value
    other than 1 is treated as agent 2's turn.

    Args:
        conv_dict: Conversation record, modified in place. Reads ``"pturn"``
            (which agent speaks first), ``"conversation"``, ``"P1"`` and
            ``"P2"``. Conversation entries may be plain strings or
            ``(index, text)`` pairs; for pairs the text is the second item.

    Returns:
        The same ``conv_dict`` with ``eval_prompt_consistency`` (list of raw
        evaluator outputs, one per line) and ``P1_prompt_consistency_score`` /
        ``P2_prompt_consistency_score`` (fraction of that agent's lines without
        a contradiction; stays ``0`` if the agent has no lines). The record is
        returned so callers can collect it, as the driver cells in this
        notebook do.
    """
    #assert 'eval_prompt_consistency' not in conv_dict # warn if we are replacing metrics we don't mean to overwrite
    conv_dict['eval_prompt_consistency'] = []
    conv_dict['P1_prompt_consistency_score'] = 0
    conv_dict['P2_prompt_consistency_score'] = 0

    # speaker -> (role key in `prompts`, backstory key, score key in `conv_dict`)
    speaker_keys = {
        1: ("agent1_role", "P1", 'P1_prompt_consistency_score'),
        2: ("agent2_role", "P2", 'P2_prompt_consistency_score'),
    }
    utterances = {1: 0, 2: 0}

    pturn = conv_dict["pturn"]
    for line in conv_dict["conversation"]:
        # str.replace needs a string; the other functions in this notebook
        # store lines as (index, text) pairs, so accept both layouts.
        text = line if isinstance(line, str) else line[1]
        speaker = 1 if pturn == 1 else 2
        role_key, backstory_key, score_key = speaker_keys[speaker]

        prompt = prompts["eval_prompts"]["prompt_consistency"].replace("%SPEAKER_ROLE%", prompts[role_key]) \
                                                              .replace("%SPEAKER_BACKSTORY%", conv_dict[backstory_key]) \
                                                              .replace("%SPEAKER_LINE%", text)
        # .get(): a config without a 'verbose' entry means "quiet" instead of a KeyError.
        if config.get('verbose', False):
            print(prompt)
        output = completion_create(config['eval_model'], config, prompt)
        conv_dict['eval_prompt_consistency'].append(output)
        if "YES" not in output: # no contradiction
            conv_dict[score_key] += 1
        utterances[speaker] += 1
        pturn = 2 if speaker == 1 else 1

    for speaker in (1, 2):
        if utterances[speaker] > 0:
            conv_dict[speaker_keys[speaker][2]] /= utterances[speaker]

    return conv_dict


In [None]:
# Scratch: load the tokenizer of 'google/gemma-2b-it' to inspect its chat template.
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it')

In [None]:
# Minimal single-turn chat used to try out the tokenizer's chat template.
messages = [
    {"role": "user", "content": 'hello world'}
]

In [None]:
# Render the chat with the model's template; tokenize=False requests the
# formatted text instead of token ids, and add_generation_prompt=True asks for
# the assistant-turn prefix to be appended.
tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)