In [None]:
from openai import OpenAI
from core.config import *
from tqdm import tqdm, trange
import jsonlines
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# batched using threads
from concurrent.futures import ThreadPoolExecutor

# Reader utils

In [None]:
def read_jsonl(file_path):
    """Load every record of a JSON Lines file into a list.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        list: The objects yielded by the ``jsonlines`` reader, in file order.
    """
    with jsonlines.open(file_path) as reader:
        return list(reader)

def get_datasets(meta, topic, base='Mistral-7B-Instruct-v0.2'):
    """Read the train and test splits for one topic of a dataset family.

    Args:
        meta: Dataset family, either ``'mmlu'`` or ``'anthropic'``.
        topic: Topic sub-directory inside the family's dataset folder.
        base: File-name prefix of the split files (``{base}-train.jsonl`` and
            ``{base}-test.jsonl``).

    Returns:
        tuple: ``(train_ds, test_ds)``, each the list returned by
        ``read_jsonl`` for the corresponding split file.

    Raises:
        ValueError: If ``meta`` is not one of the supported families.
    """
    # Resolve the family-specific root first so the path construction below
    # is shared by both families.
    if meta == 'mmlu':
        root = 'datasets/mmlu'
    elif meta == 'anthropic':
        root = 'datasets/anthropic-eval'
    else:
        raise ValueError(f"Unknown meta: {meta}")

    train_path = f'{root}/{topic}/{base}-train.jsonl'
    test_path = f'{root}/{topic}/{base}-test.jsonl'

    train_ds = read_jsonl(train_path)
    test_ds = read_jsonl(test_path)
    return train_ds, test_ds

no_to_letter = {0: 'A', 1: 'B', 2: 'C', 3: 'D'} #  0-3: A-D, ALL OTHERS: N

def format_single_entry_mmlu(entry):
    """Render one MMLU-style entry as the text that gets embedded.

    Args:
        entry: Mapping with a ``'question'`` value and a ``'choices'``
            iterable of answer options.

    Returns:
        str: ``'Question: <q>\\nChoices:\\n<L>: <choice>'`` with one
        ``<L>: <choice>`` line per option. Options 0-3 are lettered A-D;
        any further option is labelled ``'N'``.
    """
    q, choice = entry['question'], entry['choices']

    # Use .get(..., 'N') so indices outside 0-3 get the fallback letter that
    # the mapping's comment documents, instead of raising KeyError.
    choice_str = '\n'.join(
        f"{no_to_letter.get(i, 'N')}: {c}" for i, c in enumerate(choice)
    )

    return f'Question: {q}\nChoices:\n{choice_str}'

def format_single_entry_anthropic(entry):
    """Return the text to embed for one Anthropic-eval entry.

    Args:
        entry: Mapping holding the prompt under the ``'question'`` key.

    Returns:
        The ``'question'`` value, unchanged.
    """
    return entry['question']

def get_embedding_strs(ds, meta):
    """Format every entry of a dataset into the string that will be embedded.

    Args:
        ds: Iterable of dataset entries.
        meta: Dataset family, ``'mmlu'`` or ``'anthropic'``; selects the
            per-entry formatter.

    Returns:
        list: One formatted string per entry, in input order.

    Raises:
        ValueError: If ``meta`` is not one of the supported families.
    """
    if meta == 'mmlu':
        formatter = format_single_entry_mmlu
    elif meta == 'anthropic':
        formatter = format_single_entry_anthropic
    else:
        raise ValueError(f"Unknown meta: {meta}")

    return list(map(formatter, ds))
    
def preprocess_ds(meta, topic, base='Mistral-7B-Instruct-v0.2'):
    """Load both splits of a topic and turn them into embedding input strings.

    Args:
        meta: Dataset family, forwarded to ``get_datasets`` and
            ``get_embedding_strs``.
        topic: Topic name, forwarded to ``get_datasets``.
        base: Split-file prefix, forwarded to ``get_datasets``.

    Returns:
        tuple: ``(train_strs, test_strs)`` lists of formatted strings.
    """
    train_ds, test_ds = get_datasets(meta, topic, base)
    train_strs = get_embedding_strs(train_ds, meta)
    test_strs = get_embedding_strs(test_ds, meta)
    return train_strs, test_strs


In [None]:
def pack_jsonl(questions, emb):
    """Pair each question with its embedding as JSON-serialisable records.

    Args:
        questions: Iterable of question texts.
        emb: Iterable of embeddings, aligned with ``questions``.

    Returns:
        list[dict]: ``{'question': ..., 'embedding': ...}`` records; pairing
        stops at the shorter of the two inputs.
    """
    records = []
    for question, vector in zip(questions, emb):
        records.append({'question': question, 'embedding': vector})
    return records

def save_jsonl(file_path, file_content):
    """Write an iterable of records to a JSON Lines file, one per line.

    Args:
        file_path: Destination path; opened in ``'w'`` mode.
        file_content: Iterable of objects handed one by one to the
            ``jsonlines`` writer.

    Returns:
        The ``file_path`` that was written.
    """
    with jsonlines.open(file_path, 'w') as sink:
        for record in file_content:
            sink.write(record)
    return file_path

def save_npz(file_path, file_content):
    """Store a single array in an ``.npz`` archive via ``np.savez``.

    Args:
        file_path: Destination path of the archive.
        file_content: Array-like to store; it is passed positionally, so it
            ends up under NumPy's default key ``arr_0``.

    Returns:
        The ``file_path`` that was passed in.
    """
    arrays = (file_content,)
    np.savez(file_path, *arrays)
    return file_path

def save_embeddings(train_emb, test_emb, meta, topic, format='npz'):
    """Persist train and test embeddings next to the topic's dataset files.

    Args:
        train_emb: Embeddings of the train split.
        test_emb: Embeddings of the test split.
        meta: Dataset family, ``'mmlu'`` or ``'anthropic'``; selects the
            dataset root directory.
        topic: Topic sub-directory the files are written into.
        format: ``'npz'`` (written with ``save_npz``) or ``'jsonl'``
            (written with ``save_jsonl``).

    Raises:
        ValueError: If ``meta`` or ``format`` is not supported. ``meta`` is
            validated first.
    """
    if meta == 'mmlu':
        folder = f'datasets/mmlu/{topic}'
    elif meta == 'anthropic':
        folder = f'datasets/anthropic-eval/{topic}'
    else:
        raise ValueError(f"Unknown meta: {meta}")

    train_path = f'{folder}/embedding-train.jsonl'
    test_path = f'{folder}/embedding-test.jsonl'

    if format == 'npz':
        # The .npz paths are derived from the .jsonl ones by substitution.
        train_path = train_path.replace('.jsonl', '.npz')
        test_path = test_path.replace('.jsonl', '.npz')
        writer = save_npz
    elif format == 'jsonl':
        writer = save_jsonl
    else:
        raise ValueError(f"Unknown format: {format}")

    writer(train_path, train_emb)
    writer(test_path, test_emb)

    print(f"Saved embeddings to {train_path} and {test_path}")

# Embedding

In [None]:
def get_embedding(text, model="text-embedding-3-large"):
    """Request the embedding of a single text from the OpenAI embeddings API.

    Newlines in ``text`` are replaced by spaces before the request. A new
    ``OpenAI`` client is created for every call, using ``OPENAI_API_KEY``
    (not defined in this notebook; presumably provided by the
    ``core.config`` star import).

    Args:
        text: The string to embed.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    flattened = text.replace("\n", " ")
    response = OpenAI(api_key=OPENAI_API_KEY).embeddings.create(
        input=[flattened], model=model
    )
    return response.data[0].embedding

In [None]:
def batched_embedding(texts, model="text-embedding-3-large"):
    """Embed many texts concurrently using a pool of 10 worker threads.

    Args:
        texts: Iterable of strings; each one is passed to ``get_embedding``.
        model: Embedding model name forwarded to ``get_embedding``.

    Returns:
        list: One embedding per input text, in input order.
    """
    def embed_one(text):
        return get_embedding(text, model)

    with ThreadPoolExecutor(max_workers=10) as pool:
        return list(pool.map(embed_one, texts))

# Plotting

In [None]:
# Tsne

def plot_tsne(embeddings, labels, topic, meta):
    """Project embeddings to 2-D with t-SNE and save a train/test scatter plot.

    The figure is written to ``datasets/<mmlu|anthropic-eval>/<topic>/tsne.jpg``.

    Args:
        embeddings: Data handed to ``TSNE.fit_transform`` (presumably one
            row per sample; confirm against the caller).
        labels: One label per embedding row, ``0`` for train and ``1`` for
            test. May be a list or an ndarray.
        topic: Topic name, used in the title and the output path.
        meta: Dataset family; ``'mmlu'`` selects the ``mmlu`` folder, any
            other value selects ``anthropic-eval``.
    """
    # A plain list compared with an int yields a single bool rather than a
    # per-point mask, so no points would be selected. Convert up front so
    # both lists and arrays work.
    labels = np.asarray(labels)

    tsne = TSNE(n_components=2, random_state=0)
    embeddings_2d = tsne.fit_transform(embeddings)

    plt.figure(figsize=(6, 5))
    # Discrete 'Blues' palette. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap is no longer available in recent Matplotlib.
    cmap = plt.get_cmap('Blues', 4)

    # label 0 = train, label 1 = test; each group gets its own palette entry.
    for i in range(2):
        mask = labels == i
        # `color=` (not `c=`) so the RGBA tuple is always read as one colour
        # and never as per-point values.
        plt.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1], color=cmap(i), label=f'{i} set', s=10)

    plt.legend()
    plt.title(f't-SNE visualization of {meta} dataset for {topic} topic')
    ds_type = 'mmlu' if meta == 'mmlu' else 'anthropic-eval'
    path = f'datasets/{ds_type}/{topic}/tsne.jpg'
    plt.savefig(path, dpi=600)

# Setup

In [None]:
# Dataset family to process ('mmlu' or 'anthropic').
meta = 'mmlu'

# Topics to embed; uncomment an entry to include it in the run.
topics = [
    # 'machine_learning',
    # 'high_school_physics',
    # 'high_school_world_history',
    # 'high_school_chemistry',
    # 'college_biology',
    'high_school_mathematics',
]

In [None]:
def sample_embedding(meta, topic):
    """Embed one topic's train/test questions, save them and plot t-SNE.

    Args:
        meta: Dataset family, forwarded to the loading, saving and plotting
            helpers.
        topic: Topic to process.
    """
    train_strs, test_strs = preprocess_ds(meta, topic)
    train_emb = np.array(batched_embedding(train_strs))
    test_emb = np.array(batched_embedding(test_strs))
    save_embeddings(train_emb, test_emb, meta, topic, format='npz')

    all_emb = np.concatenate([train_emb, test_emb], axis=0)
    # Labels: 0 for every train row, 1 for every test row. They are built as
    # an integer ndarray (not a list) so that the elementwise `labels == i`
    # comparison in plot_tsne yields a per-point mask.
    labels = np.concatenate([
        np.zeros(len(train_emb), dtype=int),
        np.ones(len(test_emb), dtype=int),
    ])
    plot_tsne(all_emb, labels, topic, meta)

In [None]:
# Run the embed / save / plot pipeline for every selected topic, with a
# tqdm progress bar over the topics.
for topic in tqdm(topics):
    sample_embedding(meta, topic)

In [None]:
import os

# Directory whose files should have the 'dataset_' prefix stripped.
# Modify as needed.
directory = 'datasets/openend/_raw/test'

# Loop through each file in the directory
for filename in os.listdir(directory):
    # Only touch names that START with the prefix. A substring test would
    # also match names containing 'dataset_' elsewhere, and the slice below
    # would then cut off the wrong leading characters.
    if filename.startswith("dataset_"):
        # Remove the prefix 'dataset_'
        new_filename = filename[len("dataset_"):]
        # Construct full old and new file paths
        old_file = os.path.join(directory, filename)
        new_file = os.path.join(directory, new_filename)
        # Rename the file
        os.rename(old_file, new_file)
        print(f'Renamed "{filename}" to "{new_filename}"')
