In [1]:
from openai import OpenAI
import pandas as pd
import numpy as np
import yaml
import os

Group messages into conversations by `thread_id` using `pandas.DataFrame.query`

In [2]:
# Load the exported messages; later cells read its `thread_id`, `role` and
# `message` columns.
df = pd.read_csv('messages.csv')
# Distinct thread ids; each one becomes a conversation in the next cell.
thread_ids = set(df['thread_id'])

In [3]:
# Build {'convo_1': [{role: message}, ...], 'convo_2': [...], ...} with one
# entry per thread. Numbering follows the iteration order of `thread_ids`.
conversations = {}
key = 'convo_'
for num, thread_id in enumerate(thread_ids, start=1):
    # `@thread_id` hands the value to query() as a variable instead of pasting
    # it into the expression text, so string ids work as well as numeric ones
    # (and the builtin `id` is no longer shadowed by the loop variable).
    convo = df.query('thread_id == @thread_id')

    conversations[key + str(num)] = [
        {role: msg} for role, msg in zip(convo['role'], convo['message'])
    ]


In [4]:
# Persist every conversation to one YAML file so it can be browsed by hand.
with open('conversations.yaml', mode='w') as out_file:
    yaml.safe_dump(conversations, stream=out_file, indent=4, width=4)

# Instructions (Build suites)
1. Look through the `conversations.yaml` file for specific conversations that fit the suite
2. Create a `conversations.yaml` file in the respective suite directory (i.e. for the `loops` suite the file path should be `suites/loops/conversations.yaml`)
3. Create a `good` and `bad` key in the file created above.
4. Copy the whole conversation into the respective key, either `good` or `bad`
    - Make sure to reindex the conversation keys (i.e. convo_0, convo_1, ...)
    - Remove any system instructions
5. Run the `update_embeddings` cells below to build the suite's `good.yaml` and `bad.yaml` files

# Getting simple stats on conversations

In [5]:
# Read the two label files of the "loops" suite.
with open('suites/loops/good.yaml') as good_file:
    good = yaml.safe_load(good_file)
with open('suites/loops/bad.yaml') as bad_file:
    bad = yaml.safe_load(bad_file)

# Position 0 holds the good conversations, position 1 the bad ones.
suite = [good, bad]

In [6]:
# Peek at the first message of the first bad conversation; each message is a
# single-entry {role: text} dict.
bad['convo_0'][0]

{'user': 'How do I use lists in C++'}

In [7]:
# Print the mean number of messages per conversation, once per entry of
# `suite` (good first, then bad).
for metric in suite:
    # Keys containing 'embeddings' and the placeholder key 'foo' are not
    # conversations and are left out of the statistic.
    sizes = [
        len(messages)
        for convo, messages in metric.items()
        if 'embeddings' not in convo and convo != 'foo'
    ]
    # Divide by the number of conversations actually counted instead of a
    # hard-coded 10, and report 0.00 rather than crashing when there are none.
    average = sum(sizes) / len(sizes) if sizes else 0.0
    print(f'average size convo: {average:.2f}')

average size convo: 28.00
average size convo: 51.50


In [9]:
# NOTE(review): leftover scratch line. `suite` is a list ([good, bad]), so
# `suite['good']` would raise TypeError if this were re-enabled.
# [print(content.items()) for content in suite['good']['convo_0']]
None

In [10]:
def load_env(path: str = 'secrets.env') -> None:
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines and lines starting with ``#`` are ignored. Each remaining line
    is split on its first ``=`` only, so values may themselves contain ``=``
    (for example base64 padding). Whitespace around the key and the value is
    removed. Existing environment variables with the same name are overwritten.

    Args:
        path: File to read. Defaults to ``secrets.env`` in the working directory.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If a non-comment line has no ``=`` or an empty key.
    """
    with open(path) as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            key, separator, value = line.partition('=')
            key = key.strip()
            if not separator or not key:
                # The line content is left out of the message on purpose: it
                # may be (part of) a secret.
                raise ValueError(f'{path}:{line_number}: expected KEY=VALUE')
            os.environ[key] = value.strip()

def update_embeddings(suite_name: str) -> None:
    """Copy newly added conversations of a suite into its per-label YAML files.

    Reads ``suites/{suite_name}/conversations.yaml``. Each top-level key (the
    notebook uses ``good`` and ``bad``) maps conversation ids such as
    ``convo_0`` to a list of single-entry ``{role: message}`` dicts. For every
    top-level key, ``suites/{suite_name}/{key}.yaml`` is loaded (or started
    empty when it does not exist), every conversation id that is not yet in it
    is added, and the file is rewritten.

    Despite the name, no embeddings are computed here.

    Conversations already present in the per-label file are left untouched,
    even if their content changed in ``conversations.yaml``. Conversations
    without any messages are not copied.

    Args:
        suite_name: Directory name of the suite below ``suites/``.

    Raises:
        FileNotFoundError: If the suite's ``conversations.yaml`` is missing.
    """
    with open(f'suites/{suite_name}/conversations.yaml') as f:
        # safe_load() returns None for an empty document.
        suite = yaml.safe_load(f) or {}

    # Loop through the 'good' and 'bad' keys
    for metric, conversations in suite.items():
        try:
            with open(f'suites/{suite_name}/{metric}.yaml') as f:
                # An existing but empty file loads as None; treat it as "no
                # conversations yet" instead of failing on the membership test.
                embeddings = yaml.safe_load(f) or {}
        except FileNotFoundError:
            embeddings = {}

        new_embeddings = {}

        # conversations: {'convo_0': [message, ...], 'convo_1': [...], ...}
        for convo, messages in (conversations or {}).items():
            # Only ids that are new are copied; changed content is not detected.
            if convo in embeddings:
                continue
            for content in messages or []:
                # Keep the first role/message pair of each entry.
                role, msg = list(content.items())[0]
                new_embeddings.setdefault(convo, []).append({role: msg})

        print('Updating embeddings')
        embeddings.update(new_embeddings)
        with open(f'suites/{suite_name}/{metric}.yaml', 'w') as f:
            yaml.safe_dump(embeddings, f, default_flow_style=True)


In [11]:
# Load the variables from secrets.env into the environment.
load_env()

# Suite directory below suites/ to synchronise.
suite_name = 'loops'

# Writes suites/loops/<label>.yaml for every label in the suite's conversations.yaml.
update_embeddings(suite_name)

Updating embeddings
Updating embeddings


# Getting specific instances in conversations

In [12]:
# Read both label files of the "loops" suite again.
with open('suites/loops/bad.yaml') as bad_file, open('suites/loops/good.yaml') as good_file:
    bad, good = yaml.safe_load(bad_file), yaml.safe_load(good_file)

# Instructions
1. Manually comb through the `good` and `bad` files and find the specific points in the chat where GPT did really well or really badly.
    - For the `bad` chats, make sure to find the first point where GPT fails.
2. Use the print statements below to find the specific index of those chats and save them in their respective `good/bad_responses` dictionary
3. Run the code under the "Adding metrics" heading below

In [13]:
# Show each message of bad conversation 9 next to its position, so positions
# of interest can be copied into `bad_responses`.
for position, entry in enumerate(bad['convo_9']):
    print(f'{position} {entry}')

0 {'user': 'def grade_outlier(input_data):\n    input_data = get_string()\n    difference = []\n    for name_entry, element in input_data:\n        semester_grades = element[4:]\n        new_semester_grades = semester_grades.split(",")\n        first_sem = element[0]\n        second_sem = element[1]\n        thrid_sem = element[2]\n        fourth_sem = element[3]\n        sorted_grades = sorted(semester_grades)\n        difference = sorted_grades[1] - sorted_grades[0]\n    print(difference)\n    return difference'}
1 {'assistant': "This code seems to be referencing some variables before they're defined, and it's not clear what format the input_data is in. Can you explain your thinking here?"}
2 {'user': 'def get_string():\n    filename = "admission_algorithms_dataset.csv"\n    input_data = []\n    input_file = open(filename, "r")\n    with input_file as file:\n        headers = input_file.readline()\n        for line in file:\n            elements = line.split(",")\n            names =

In [14]:
# Hand-picked message positions per bad conversation; each number is an index
# into the corresponding list in `bad`.
bad_responses = {
    "convo_0": [1,3],
    "convo_1": [7,9,11],
    "convo_2": [9],
    "convo_3": [11,15],
    "convo_4": [25],
    "convo_5": [49,57,85,103],
    "convo_6": [7,13], # The user doesn't put a lot of code into the chat, but expresses a lack of understanding, which leads GPT to give code
    "convo_7": [13,17],
    "convo_8": [113,115,123,135], 
    "convo_9": [95,101,103] # Really resistant to giving out code when code is dumped into the chat
}

In [15]:
# Show each message of good conversation 9 next to its position, so positions
# of interest can be copied into `good_responses`.
for position, entry in enumerate(good['convo_9']):
    print(f'{position} {entry}')

0 {'user': 'any idea why the autograder can\'t grade my code? I\'ll send it in two parts because it\'s too long:\n\ndef even_weighted(s):\n    """\n> x = [1, 2, 3, 4, 5, 6]\n>     >>> even_weighted(x)\n>     [0, 6, 20]\n>     """\n>     "*** YOUR CODE HERE ***"\n>     count = 0\n>     evens = []\n>     new_evens = []\n>     while count + 1 <= len(s):\n>         evens.append(s[count])\n>         count += 2\n>     count2 = 0\n>     while count2 + 1 <= len(evens):\n>         new_evens.append(evens[count2] * count2)\n>     return new_evens\n> \n> \n> def couple(s, t):\n>     """Return a list of two-element lists in which the i-th element is [s[i], t[i]].\n> \n>     >>> a = [1, 2, 3]\n>     >>> b = [4, 5, 6]\n>     >>> couple(a, b)\n>     [[1, 4], [2, 5], [3, 6]]\n>     >>> c = [\'c\', 6]\n>     >>> d = [\'s\', \'1\']\n>     >>> couple(c, d)\n>     [[\'c\', \'s\'], [6, \'1\']]\n>     """\n>     assert len(s) == len(t)\n>     "*** YOUR CODE HERE ***"\n>     count = 0\n>     new = []\n>     w

In [16]:
# Hand-picked message positions per good conversation; each number is an index
# into the corresponding list in `good`. The target was about 5 good responses
# per conversation, but several entries below have fewer or more.
good_responses = {
    "convo_0": [11,21,33,39,49],
    "convo_1": [3,7,13,17,19],
    "convo_2": [3,5,15,23,39],
    "convo_3": [5,7,9,11],
    "convo_4": [3,5],
    "convo_5": [3,9,11,13,21,23,27,31], # Does a very good job of deflecting requests to write code and clearly explains the process that should be implemented
    "convo_6": [1,7,13,19,25],
    "convo_7": [1,3,7,13],
    "convo_8": [3,7,9,15,17,25,29,35],
    "convo_9": [13,15]
}

In [17]:
# Reminder kept as a bare string. The 4-characters-per-token estimate it
# mentions is what the `token_count` fields in the index cells are computed with.
"""
WHEN YOU FIND THE INDEX, LOOK AT THE TOTAL TOKEN COUNT TOO!! 
MAYBE THAT HAS SOMETHING TO DO WITH GPT FORGETTING SYSTEM LEVEL INSTRUCTIONS.
USE THIS AS A CONVERSION TO TOKENS 
*** 1 token ~= 4 characters ***
"""
None

In [18]:
# Load secrets.env into the environment before reading the API key from it.
# NOTE(review): presumably secrets.env defines OPENAI_API_KEY - confirm.
load_env()

# os.getenv() yields None when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Adding metrics 
## like conversation index and token count (up until that point not including the prompt)
----- Bad Conversations -----

In [19]:
# For every bad conversation, collect the hand-labelled responses together
# with their position, a rough token count of the conversation up to and
# including that message, and an embedding of the response text.
bad_index = {
    "test1": []
}

for convo, messages in bad.items():
    char_count = 0
    # .get(): a conversation without labelled positions contributes only its
    # header entry instead of aborting the whole run with a KeyError.
    indexes = set(bad_responses.get(convo, []))
    context = [{"convo": convo}]
    for idx, message in enumerate(messages):
        # Each message is a single-entry {role: text} dict; only the text is used.
        msg = list(message.values())[0]
        char_count += len(msg)
        if idx not in indexes:
            continue
        embedding = client.embeddings.create(input=msg, model='text-embedding-3-large').data[0]
        context.append({
            'response': msg,
            'index': idx,
            # Rough estimate: 1 token ~= 4 characters.
            'token_count': char_count / 4,
            # model_dump() is the public way to turn the response model into a
            # plain dict that YAML can serialise (rather than reading __dict__).
            'embeddings': embedding.model_dump(),
        })
    bad_index['test1'].append(context)

In [20]:
# Save the labelled bad responses, with their metrics and embeddings, to disk.
with open('suites/loops/bad_index.yaml', mode='w') as out_file:
    yaml.safe_dump(bad_index, stream=out_file, default_flow_style=True)

----- Good Conversations ------

In [21]:
# For every good conversation, collect the hand-labelled responses together
# with their position, a rough token count of the conversation up to and
# including that message, and an embedding of the response text.
good_index = {
    "test1": []
}

for convo, messages in good.items():
    char_count = 0
    # .get(): a conversation without labelled positions contributes only its
    # header entry instead of aborting the whole run with a KeyError.
    indexes = set(good_responses.get(convo, []))
    context = [{"convo": convo}]
    for idx, message in enumerate(messages):
        # Each message is a single-entry {role: text} dict; only the text is used.
        msg = list(message.values())[0]
        char_count += len(msg)
        if idx not in indexes:
            continue
        embedding = client.embeddings.create(input=msg, model='text-embedding-3-large').data[0]
        context.append({
            'response': msg,
            'index': idx,
            # Rough estimate: 1 token ~= 4 characters.
            'token_count': char_count / 4,
            # model_dump() is the public way to turn the response model into a
            # plain dict that YAML can serialise (rather than reading __dict__).
            'embeddings': embedding.model_dump(),
        })
    good_index['test1'].append(context)

In [22]:
# Save the labelled good responses, with their metrics and embeddings, to disk.
with open('suites/loops/good_index.yaml', mode='w') as out_file:
    yaml.safe_dump(good_index, stream=out_file, default_flow_style=True)