In [2]:
%env CUDA_VISIBLE_DEVICES=1,7
%env TMPDIR=/raid/users/ryan_cheng/tmp
import os
import glob
import re
import json
import random
import time
import pickle
from absl import app, flags
from tqdm import tqdm
from datetime import datetime
import openai
from openai import OpenAI
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

np.random.seed(0)

from utils import *
import utils
from consistency_eval import *
from education_generation import *

try:
    from vllm import LLM, SamplingParams
    import ray
except ImportError:
    pass

env: CUDA_VISIBLE_DEVICES=1,7
env: TMPDIR=/raid/users/ryan_cheng/tmp


In [3]:
# Read the OpenAI API key from a text file one directory above the working
# directory and install the resulting client on the shared `utils` module.
api_key_path = os.path.abspath('../ryan_openai.txt')
with open(api_key_path, 'r') as key_file:
    # Drop trailing newline(s) so they do not end up inside the key.
    api_key = key_file.read().rstrip('\n')
utils.client = OpenAI(api_key=api_key)

In [4]:
# choose config file to load into global config dictionary in utils.py
# with open("./config/persona_chat/Llama-3.1-70B-Instruct.json", 'r') as f:
#     config_llama = json.load(f)
with open("./config/education/gpt-4o-mini.json", 'r') as config_file:
    config_gpt4_mini = json.load(config_file)

# Overlay the loaded settings on the existing `config` mapping; keys that are
# not present in the JSON file keep their current values.
config.update(config_gpt4_mini)

# this modifies the global prompts dictionary in utils.py
with open('config/education/prompts.json', 'r') as prompts_file:
    new_prompts = json.load(prompts_file)

# Same overlay for the prompt templates.
prompts.update(new_prompts)

In [4]:
# Set the `seed` entry of the shared config to 1, replacing any value that the
# JSON config loaded earlier may have provided.
# NOTE(review): how `seed` is consumed is not visible in this notebook.
config['seed'] = 1

In [5]:
# Sanity check: list the prompt templates available after the overlay above.
prompts.keys() 

dict_keys(['agent1_role', 'agent2_role', 'dialogue_prompt_original', 'dialogue_prompt_old', 'dialogue_prompt', 'student_preferences', 'interactive', 'story', 'lecture', 'rude', 'anxious', 'distant', 'teacher_preferences'])

In [6]:
# Load the conversation prompt records used to seed the generated dialogues.
with open('data/education/conversations_train1.json', 'r') as prompts_file:
    conversation_prompts = json.load(prompts_file)

# Show which background fields the first record provides.
first_background = conversation_prompts[0]['background_info']
first_background.keys()

dict_keys(['topic', 'student_prefrences', 'teacher_prefrences', 'student_reactions', 'teacher_reactions'])

In [7]:
# Skeleton of a per-conversation stats record.
# NOTE(review): no other cell visible in this notebook references this name.
stats_template =  {"task_name": "education", "conversation": [], "rounds": 0}

# Generate conversations using Student, Topic

In [14]:
# Re-seed so the sampled indices are the same every time this cell is run.
np.random.seed(0)
# Draw 40 distinct prompt indices and arrange them as 20 rows of two; the
# generation loop below unpacks each row as (student_i, topic_i).
random_indices = np.random.choice(len(conversation_prompts), size=40, replace=False).astype(int).reshape(-1, 2)
random_indices

array([[3069, 1675],
       [6385,  543],
       [3213,  134],
       [5869, 1425],
       [1819, 7492],
       [5828, 4749],
       [6334, 5387],
       [6303, 2250],
       [4587, 5904],
       [5476, 2491],
       [ 276, 3480],
       [ 273,  162],
       [2164, 2756],
       [7207,   39],
       [7103, 3197],
       [4855, 6406],
       [7351, 3790],
       [3501,  234],
       [3343, 6875],
       [2980, 4977]])

In [None]:
# Run-time overrides on the shared config for the generation run below.
# NOTE(review): presumably read by generate_conversation; the unit of
# `convo_length_limit` (turns vs. rounds) is not visible here — confirm.
config['verbose'] = False
config['convo_length_limit'] = 15



In [17]:
# Inspect the full background record of the first prompt.
conversation_prompts[0]['background_info']

{'topic': "Archimedes' Principle",
 'student_prefrences': 'creative expression/story telling/gamification',
 'teacher_prefrences': 'direct instruction/lecture-based learning',
 'student_reactions': 'and gets frustrated otherwise',
 'teacher_reactions': 'and gets frustrated otherwise'}

In [19]:
# Gather the student preference and reaction strings of every prompt record so
# the distinct values in the dataset can be listed.
backgrounds = [record['background_info'] for record in conversation_prompts]
student_prefs = [background['student_prefrences'] for background in backgrounds]
student_reacs = [background['student_reactions'] for background in backgrounds]

# np.unique returns the sorted distinct values of each list.
print(np.unique(student_prefs))
print(np.unique(student_reacs))

['creative expression/story telling/gamification'
 'direct instruction/lecture-based learning'
 'hands-on activities/real-world applications'
 'interactive learning/class discussions/asking questions']
['and gets anxious otherwise' 'and gets disengaged otherwise'
 'and gets frustrated otherwise' 'and gets rude otherwise'
 'and might tell it to the teacher' 'but might adapt to other methods']


In [None]:
# Generate one conversation per (student, topic) index pair in `random_indices`
# and write all of them to a single JSON file.
conversations = []
for convo_index, (student_i, topic_i) in enumerate(tqdm(random_indices)):
    # The student persona and the topic come from two different prompt records.
    topic = conversation_prompts[topic_i]['background_info']['topic']
    stats = generate_conversation(conversation_prompts[student_i]['background_info'],
                                  topic)
    # Tag the record with the models used, its position in this run, and the
    # wall-clock time at which it finished.
    stats['model'] = config['agent1_model']
    stats['model2'] = config['agent2_model']
    stats['index'] = convo_index
    stats['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    conversations.append(stats)

# NOTE(review): the student/teacher/topic cell further down writes to this same
# path, so running both cells keeps only the later run's output.
with open("data/education/conversations_test.json", 'w') as f:
    json.dump(conversations, f, indent=4)

# Generate conversations using Student, Teacher, Topic

In [13]:
# Re-seed so the sampled indices are the same every time this cell is run.
np.random.seed(0)
# Draw 60 distinct prompt indices and arrange them as 20 rows of three; the
# generation loop below unpacks each row as (student_i, teacher_i, topic_i).
random_indices = np.random.choice(len(conversation_prompts), size=60, replace=False).astype(int).reshape(-1, 3)
random_indices

array([[3069, 1675, 6385],
       [ 543, 3213,  134],
       [5869, 1425, 1819],
       [7492, 5828, 4749],
       [6334, 5387, 6303],
       [2250, 4587, 5904],
       [5476, 2491,  276],
       [3480,  273,  162],
       [2164, 2756, 7207],
       [  39, 7103, 3197],
       [4855, 6406, 7351],
       [3790, 3501,  234],
       [3343, 6875, 2980],
       [4977, 5165, 1255],
       [2915, 2542,  257],
       [2531,  533, 1742],
       [1259, 3740, 5348],
       [6284, 1865, 6169],
       [ 574, 5434, 6662],
       [5782,  882, 1531]])

In [72]:
# Run-time overrides on the shared config for the generation run below.
# NOTE(review): presumably read by generate_conversation_old; the unit of
# `convo_length_limit` (turns vs. rounds) is not visible here — confirm.
config['verbose'] = False
config['convo_length_limit'] = 15

In [None]:
# Generate one conversation per (student, teacher, topic) index triple in
# `random_indices` and write all of them to a single JSON file.
conversations = []
for convo_index, (student_i, teacher_i, topic_i) in enumerate(tqdm(random_indices)):
    # Student persona, teacher persona and topic each come from a different
    # prompt record.
    topic = conversation_prompts[topic_i]['background_info']['topic']
    stats = generate_conversation_old(conversation_prompts[student_i]['background_info'],
                                      conversation_prompts[teacher_i]['background_info'],
                                      topic)
    # Tag the record with the models used, its position in this run, and the
    # wall-clock time at which it finished.
    stats['model'] = config['agent1_model']
    stats['model2'] = config['agent2_model']
    stats['index'] = convo_index
    stats['timestamp'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    conversations.append(stats)

# NOTE(review): the student/topic cell further up writes to this same path, so
# running both cells keeps only the later run's output.
with open("data/education/conversations_test.json", 'w') as f:
    json.dump(conversations, f, indent=4)

100%|██████████| 20/20 [05:44<00:00, 17.21s/it]


In [75]:
def split_conversation(conversation, speaker1, speaker2):
    """Split a raw transcript into one "<Speaker>: <message>" string per turn.

    A turn starts at a literal "<speaker>:" label and runs up to the next
    label or the end of the text. Text that precedes the first label is
    discarded, and consecutive turns by the same speaker are merged into a
    single entry.

    Args:
        conversation: Transcript text containing the speaker labels.
        speaker1: First speaker name, without the trailing colon.
        speaker2: Second speaker name, without the trailing colon.

    Returns:
        A list of strings, each a speaker label, one space, and that speaker's
        whitespace-stripped message(s) joined by single spaces. A label that is
        never followed by any text yields "<Speaker>: " with an empty message.
    """
    # Escape the names so regex metacharacters in them (".", "(", "+", ...)
    # are matched literally instead of changing the pattern or adding groups.
    labels = "|".join(re.escape(name) + ":" for name in (speaker1, speaker2))
    # Lazy message body + lookahead: each message stops right before the next
    # label (or at the end of the text) without consuming that label.
    pattern = fr"({labels})(.*?)(?={labels}|$)"

    # One [label, [non-empty message parts]] item per merged turn.
    turns = []
    for label, message in re.findall(pattern, conversation, re.DOTALL):
        label = label.strip()
        text = message.strip()
        if not turns or turns[-1][0] != label:
            turns.append([label, []])
        # Skip empty fragments (e.g. a doubled "Teacher: Teacher:" prefix) so
        # merging never produces doubled or trailing spaces.
        if text:
            turns[-1][1].append(text)

    return [f"{label} {' '.join(parts)}" for label, parts in turns]

In [None]:
# Sample raw generation chunks used to exercise split_conversation by hand.
# Several chunks contain more than one speaker label, a doubled label, or a
# message that is cut off mid-sentence.
example_conversation = [
            "Teacher: Teacher: An example is how humans have evolved over time. For example, we used to have more body hair than we do today.\n\nStudent: That is very interesting. I have never heard of humans having more body hair than we do today. What is the\n",
            "Student: I'm not sure what you mean\n",
            "Teacher: It's easy to imagine because it's true. We just don't notice it because we don't see it every day.\nStudent: Well, I'm not sure if I buy that. It seems like something\n",
            "Student: That's cool! I didn't realize that cats could change their\n",
            "Teacher: If you don't look for it, you won't find it.\nStudent: I guess you're right\n",
            "Student: Because I know that cats live in forests and trees\nTeacher: Exactly! You know that cats live in forests and trees, so you would expect to see one there. Now imagine if you were walking through a desert and suddenly came across a tree. What\n",
            "Teacher: True! Sometimes things look strange even when they aren't strange. That's why we say \"if something looks strange, then it probably isn't.\" It's all about context.\nStudent: I guess so\nStudent: I don't think I'm going to get anywhere with\n",
            "Student: Alright, thanks for everything.\nTeacher: No problem!\n",
            "Teacher: Well, we're all different. I think it's great that we can have a conversation about something like this. It's very important to me to help students learn as much as they can.\nStudent: Thank you\nStudent: That's true. It is important to teach people as much as possible. But sometimes people don't want to learn. So then what do you do?\nTeacher\n",
            "Student: Bye.\n\n"
        ]

In [86]:
# First parsed turn of the first sample chunk, which starts with a doubled
# "Teacher:" label.
split_conversation(example_conversation[0], "Teacher", "Student")[0]

'Teacher: An example is how humans have evolved over time. For example, we used to have more body hair than we do today.'

In [89]:
# Second sample: a single string that begins with unlabelled text and ends
# with a "Teacher: <END>" turn.
example2_conversation = " We'll do this experientially. The first step is to observe The Great Wall of China. Student: I don't think I've been to China before, and I'm not sure what The Great Wall of China is. I might not even know what it is now. Teacher: You don't have to be there. We can do this digitally. Watch this video.Student: I'm watching the video. Teacher: <END>"

In [91]:
# Last parsed turn of the second sample.
split_conversation(example2_conversation, "Teacher", "Student")[-1]

'Teacher: <END>'

# Evaluate Conversations based on student response

In [9]:
# Load a file of already-evaluated conversations and list the fields of the
# first record.
# NOTE(review): absolute, machine-specific path — consider making it relative
# to a configurable data directory.
eval_results_path = '/nfs/kun2/users/ryan_cheng/consistency_LLMs/data/education/exp/Llama-3.1-8B-Instruct_10.json'
with open(eval_results_path, 'r') as results_file:
    conversation_json = json.load(results_file)
conversation_json[0].keys()

dict_keys(['task_name', 'P1', 'P2', 'topic', 'student_background', 'student_preference', 'student_reaction', 'conversation', 'pturn', 'index', 'timestamp', 'rounds', 'conversation_only', 'model1', 'model2', 'eval_prompt_consistency', 'P1_prompt_consistency_score', 'P2_prompt_consistency_score', 'eval_all_line_consistency', 'P1_all_line_consistency_score', 'P2_all_line_consistency_score', 'eval_prev_line_consistency', 'P1_prev_line_consistency_score', 'P2_prev_line_consistency_score', 'eval_survey_consistency', 'P1_survey_consistency_score', 'P2_survey_consistency_score', 'P1_backstory_test', 'P2_backstory_test', 'eval_pairwise_consistency', 'P1_pairwise_consistency_score', 'P2_pairwise_consistency_score'])