In [None]:
import os
import pickle
import random
import re
import time

import numpy as np
import openai
import pandas as pd
from datasets import load_dataset
from nltk import sent_tokenize
from openai import OpenAI
from sklearn.metrics import f1_score
from tqdm import tqdm

openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Shared OpenAI API client; the key is looked up in the OPENAI_API_KEY
# environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [None]:
# Read the two CBT parquet files from the local data/ directory into DataFrames.
CBT_V_train_df = pd.read_parquet(path='data/CBT_V.parquet')  # CBT "V" split
CBT_P_train_df = pd.read_parquet(path='data/CBT_P.parquet')  # CBT "P" split

In [None]:
# function to get k samples from the CBT V dataset (or any frame passed in)
def get_samples(k, df=None):
    """Format the first ``k`` rows of a dataset as numbered text samples.

    Args:
        k: Maximum number of rows to include. Values below zero are treated
            as zero; if the frame has fewer than ``k`` rows, only the
            available rows are used.
        df: DataFrame with a "sentences" column whose cells are iterables of
            strings. Defaults to the module-level ``CBT_V_train_df``.

    Returns:
        One string in which each sample is rendered as ``"Sample{i}: "``
        (``i`` starting at 0) followed by its sentences joined with newlines,
        and each sample is terminated by a blank line.
    """
    if df is None:
        df = CBT_V_train_df
    # Each sample must come from its own row, so iterate over the first k rows
    # rather than re-reading row 0 for every index.
    rows = df["sentences"].iloc[:max(k, 0)]
    return "".join(
        f"Sample{i}: " + "\n".join(list(sentences)) + "\n\n"
        for i, sentences in enumerate(rows)
    )

In [None]:
# Prompt used to elicit task-relevant concept categories from the chat model.
# It contains: (1) a short framing of the neuron/concept study, (2) ten example
# neuron-concept descriptions, (3) the text returned by get_samples(10), and
# (4) an instruction to answer as a numbered "1. concept" list.
# NOTE: this is an f-string, so get_samples(10) is evaluated when this cell runs.
prompt = f"""
We're studying how neurons in a neural network affect the model's performance on specific tasks. Each neuron looks for some particular thing in a short document. To measure how neurons are related to the given task, we want to know what concepts are important for the task. 

Neuron concepts examples:
1. the past and present tense of the verb "to be" (was, were, is).
2. variations of the verb 'be'.
3. modal verbs, especially "would" and "were".
4. action verbs related to starting or beginning.
5. future tense verbs and words related to commitment.
6. the usage of the verb "to be" and its conjugations.
7. the verb 'use' and its variations.
8. the word "could" and similar auxiliary verbs indicating possibility.
9. the word "like" and its variations, as well as other verbs expressing desire or interest.
10. verbs related to posting and sharing information.

Given the input samples below:

{get_samples(10)}

List a comprehensive list of categories of concepts that are important for language models to comprehend the given texts. Output in the following format:
1. concept1
2. concept2
...
"""

In [None]:
# Show the fully rendered prompt (with the sampled texts substituted in).
print(prompt)

In [None]:
# Send the prompt as a single user message to the gpt-4 chat model.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": prompt}],
)

In [None]:
def convert_to_list(input_str):
    """Parse a numbered, newline-separated list into a list of item strings.

    Each line is expected to look like ``"3. some concept"``. Only a leading
    ``<digits>.`` marker is removed, so periods inside the item text
    (e.g. "e.g.") and un-numbered lines are preserved intact. Lines that are
    empty after stripping are dropped, so the result never contains empty
    strings.

    Args:
        input_str: Raw text, one list item per line.

    Returns:
        List of non-empty item strings in their original order.
    """
    items = []
    for line in input_str.splitlines():
        # Strip only the "N." enumeration prefix, not everything up to the
        # first period on the line.
        item = re.sub(r"^\s*\d+\.\s*", "", line).strip()
        if item:
            items.append(item)
    return items

In [None]:
# Parse the text of the first returned choice into a list of concept strings.
concept_list = convert_to_list(response.choices[0].message.content)

In [None]:
# Display the generated concepts, numbered by their index in concept_list.
for idx, concept in enumerate(concept_list):
    print(f"{idx}. {concept}")

In [None]:
# get text embedding
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Newlines in ``text`` are replaced with spaces before the request is made.
    The text is sent as a one-element batch and the embedding of that single
    entry is returned.
    """
    flattened = text.replace("\n", " ")
    result = client.embeddings.create(input=[flattened], model=model)
    return result.data[0].embedding

In [None]:
# Embed every concept, one API request per concept, preserving list order.
embedding_list = [get_embedding(concept) for concept in concept_list]

In [None]:
# Persist the concept embeddings as a single NumPy array (one row per entry of
# embedding_list) in a pickle file under data/.
# Requires `pickle` from the notebook's import cell.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open('data/CBT_V_concepts_embedding.pkl', 'wb') as file:
    pickle.dump(np.array(embedding_list), file)