In [1]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from torch.utils.data import DataLoader, Dataset
# import torch
import pandas as pd
import openai
import re
from datasets import load_dataset, load_from_disk
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
import os
import tiktoken

# device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
 #print(device)

In [2]:
# Shared OpenAI API client used by the cells below.
# NOTE(review): no key is passed explicitly; presumably it comes from the environment populated by load_dotenv() in the first cell - confirm.
client = openai.OpenAI()

In [9]:
# Smoke test of the client: send a single user message to gpt-4o-mini and
# keep the whole response object in `resp`.
resp = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "derive the quadratic formula."}],
)

In [12]:
# Show the message text of the first choice of the smoke-test request.
# The response object is bound to `resp`; no cell defines `stream`.
resp.choices[0].message.content

'To derive the quadratic formula, we start with the general form of a quadratic equation:\n\n\\[\nax^2 + bx + c = 0\n\\]\n\nwhere \\(a\\), \\(b\\), and \\(c\\) are constants, and \\(a \\neq 0\\). We want to solve for \\(x\\). The first step is to move the constant term to the other side of the equation:\n\n\\[\nax^2 + bx = -c\n\\]\n\nNext, we will complete the square for the left-hand side. To do this, we first divide the entire equation by \\(a\\) to make the coefficient of \\(x^2\\) equal to 1:\n\n\\[\nx^2 + \\frac{b}{a}x = -\\frac{c}{a}\n\\]\n\nNow, we need to complete the square. The term needed to complete the square involves taking half of the coefficient of \\(x\\) (which is \\(\\frac{b}{a}\\)), squaring it, and adding it to both sides. Half of \\(\\frac{b}{a}\\) is \\(\\frac{b}{2a}\\), and squaring this gives:\n\n\\[\n\\left(\\frac{b}{2a}\\right)^2 = \\frac{b^2}{4a^2}\n\\]\n\nWe add this term to both sides:\n\n\\[\nx^2 + \\frac{b}{a}x + \\frac{b^2}{4a^2} = -\\frac{c}{a} + \\fra

In [3]:
# Load the "main" configuration of the openai/gsm8k dataset and convert its
# 'test' and 'train' splits to pandas DataFrames for the cells below.
gsm8k = load_dataset("openai/gsm8k", "main")
gsm8k_test = gsm8k['test'].to_pandas()
gsm8k_train = gsm8k['train'].to_pandas()

In [4]:
def prompts(df, model : str, dist_folder : str, subset : str, prompt_function, start, end):
    """Fill ``df['model_prompt']`` in place for the rows at positions ``start:end``.

    The column is (re)created on every call: rows inside the positional slice
    receive ``prompt_function(question)``, every other row receives ``''``.

    Args:
        df: DataFrame with a ``'question'`` column; it is mutated in place.
        model, dist_folder, subset: not used by this function; kept so that
            existing call sites keep working.
        prompt_function: callable taking one question and returning the prompt.
        start, end: positional slice bounds (normal Python slice semantics).

    Returns:
        None.
    """
    row_count = len(df)
    built = [''] * row_count
    questions = df['question'].tolist()
    # Rows are addressed by position, not by index label, so a non-default or
    # non-unique index cannot make one prompt overwrite several rows.
    for pos in range(row_count)[start:end]:
        built[pos] = prompt_function(questions[pos])
    # A single column assignment replaces one .loc write per row.
    df['model_prompt'] = built
        
        

In [36]:
from async_client import batch_file
def open_batch(df, batch_name : str, model : str, dist_folder : str, subset : str):
    """Turn every row's ``'model_prompt'`` into a chat request and pass them to ``batch_file``.

    Each request is a dict with a ``'messages'`` list holding an empty system
    message followed by the user prompt. ``dist_folder`` and ``subset`` are not
    used. The notebook-level ``client`` is forwarded to ``batch_file``.

    Returns:
        Whatever ``batch_file(batch_name, model, requests, client)`` returns.
    """
    def _as_request(prompt_text):
        # One chat payload: blank system turn, then the user prompt.
        return {
            'messages' : [
                {'role' : 'system', 'content' : ''},
                {'role' : 'user', 'content' : prompt_text},
            ]
        }

    requests = [_as_request(record['model_prompt']) for _, record in df.iterrows()]
    return batch_file(batch_name, model, requests, client)
        

In [9]:
def save_df(df : pd.DataFrame, model : str, dist_folder : str, edition : str):
    """Write ``df`` as CSV to ``../../logs/<model>/<dist_folder>/<edition>.csv``.

    The path is relative to the current working directory. Missing parent
    directories are created first, so the first save for a new model or
    folder no longer fails.

    Args:
        df: frame to persist (written with its index, the ``to_csv`` default).
        model: name of the per-model log directory.
        dist_folder: name of the sub-directory under the model directory.
        edition: file name without the ``.csv`` extension.
    """
    # NOTE(review): load_disk in this notebook reads from '../logs' (one level
    # up) while this writes to '../../logs' - confirm which root is intended.
    target = Path() / '..' / '..' / 'logs' / model / dist_folder / f'{edition}.csv'
    # to_csv does not create directories; make sure the destination exists.
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target)

def assess_model(df, model : str, dist_folder : str, subset : str, prompt_function, model_respond, max_tokens = 50, start = 0, end = None):
    """Run a model over ``df['question']`` and record its answers in ``df``.

    For every row at positions ``start:end`` the question is encoded with
    ``prompt_function``, answered with ``model_respond``, and three columns
    are filled in place: ``model_answer``, ``model_output_tokens`` and
    ``model_max_tokens``. All three columns are reset at the start of a call.

    Args:
        df: DataFrame with a ``'question'`` column; mutated in place.
        model, dist_folder: forwarded to ``save_df`` to pick the log location.
        subset: not used; kept for interface compatibility.
        prompt_function: callable ``question -> tokens`` where
            ``tokens['input_ids'][0]`` supports ``.tolist()``.
        model_respond: callable ``(tokens, max_tokens) -> (answer, out_tokens)``
            where ``out_tokens[0]`` supports ``.tolist()``.
        max_tokens: generation budget passed to ``model_respond``; a row whose
            output-token count equals it is flagged in ``model_max_tokens``.
        start, end: positional slice bounds; ``end=None`` means "to the last row".

    Side effects:
        Prints progress every 5 rows, saves an intermediate CSV every 10 rows
        and a final CSV at the end, all through ``save_df``.
    """
    numrows = df.shape[0]
    # Only None means "to the end"; an explicit end=0 is an empty range.
    if end is None:
        end = numrows
    df['model_answer'] = ''
    df['model_output_tokens'] = 0
    df['model_max_tokens'] = False
    answer_col = df.columns.get_loc('model_answer')
    count_col = df.columns.get_loc('model_output_tokens')
    capped_col = df.columns.get_loc('model_max_tokens')
    questions = df['question'].tolist()

    # Rows are addressed by position so index labels never alias each other.
    for pos in range(numrows)[start:end]:
        # The generation calls were previously commented out, which left
        # tokens / val / out_tokens undefined; use the injected callables.
        tokens = prompt_function(questions[pos])
        val, out_tokens = model_respond(tokens, max_tokens)

        # Newly produced tokens = full output length minus the input length.
        out_token_count = len(out_tokens[0].tolist()) - len(tokens['input_ids'][0].tolist())
        df.iloc[pos, answer_col] = val
        df.iloc[pos, count_col] = out_token_count
        df.iloc[pos, capped_col] = out_token_count == max_tokens

        # 1-based position within the whole frame, so progress and the
        # intermediate file names stay comparable when resuming with start > 0.
        done = pos + 1
        if done % 5 == 0:
            print(f'Completed : {done} out of {numrows}')
            print(f'Completed : {round(100 * done / numrows, 2)}%')
            print('\n')
        if done % 10 == 0:
            save_df(df, model, dist_folder, f'intermediate_{done}')
    save_df(df, model, dist_folder, f'FINAL_Completed({start} : {end})')


In [6]:
def default_prompt(question):
    """Return ``question`` followed by the fixed answer-only instruction."""
    instruction = '\n Please respond with just the answer. The answer is:'
    return question + instruction

In [9]:
def student_tokenizerd(prompt):
    """Encode ``prompt`` with ``student_tokenizer`` (``return_tensors="pt"``) and move the result to ``device``.

    Both ``student_tokenizer`` and ``device`` are notebook-level names; neither
    is defined in the cells visible here, so they must be created beforehand.
    """
    return student_tokenizer(prompt, return_tensors="pt").to(device)

In [10]:
def student_generation(tokens, max_tokens=50):
    """Generate with ``student_model`` (sampling disabled) and decode the first sequence.

    Args:
        tokens: mapping unpacked into ``student_model.generate``.
        max_tokens: passed as ``max_new_tokens``.

    Returns:
        ``(text, generated)``: the first decoded sequence with special tokens
        skipped, and the raw ``generate`` output after ``.to(device)``.
    """
    generated = student_model.generate(**tokens, max_new_tokens=max_tokens, do_sample=False)
    generated = generated.to(device)
    decoded = student_tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0], generated

In [24]:
# One-off check on the test question with index label 6, allowing up to 50 new tokens:
# `hi` receives the decoded text, `to` the raw generate() output.
hi, to = student_generation(student_tokenizerd(default_prompt(gsm8k_test['question'][6])), 50)

In [7]:
# Build the 'model_prompt' column with default_prompt for rows 0..1318 of the test frame.
# NOTE(review): 1319 is hard-coded; presumably it is the size of the test split - confirm.
prompts(gsm8k_test, 'gpt-4o-mini', 'no-distil-test', 'test', default_prompt, 0, 1319)

In [10]:
# Write the test frame to 'prompts.csv' in the gpt-4o-mini / no-distil-test log folder.
save_df(gsm8k_test, 'gpt-4o-mini', 'no-distil-test', 'prompts')

In [38]:
# Hand the prompts in gsm8k_test to batch_file via open_batch; dist_folder and
# subset are passed as '' because open_batch does not use them.
open_batch(gsm8k_test, 'init_assessment-gpt4o-mini-distill', 'gpt-4o-mini', '', '')

PosixPath('../logs/gpt-4o-mini/init_assessment-gpt4o-mini-distill.jsonl')

In [None]:
# Evaluate the local student model on the test questions with max_tokens=300.
# NOTE(review): student_tokenizerd is passed in the prompt_function slot, so
# default_prompt is not applied to the questions here - confirm this is intended.
assess_model(gsm8k_test, 'gemma2b', 'no-distil-test', 'test', student_tokenizerd, student_generation, 300)

# Load data

In [2]:
def load_disk(model : str, dist_folder : str, edition : str = 'FINAL'):
    """Return ``load_from_disk`` applied to ``../logs/<model>/<dist_folder>/<edition>``.

    The path is relative to the current working directory.
    """
    location = Path('..', 'logs', model, dist_folder, edition)
    return load_from_disk(location)

In [3]:
# Load the 'intermediate_5' edition stored under ../logs/gpt2-xl/no-distil-test/.
# NOTE(review): save_df in this notebook writes CSV files, whereas this goes
# through load_from_disk - presumably this checkpoint was written elsewhere; confirm.
gsm8k_data = load_disk('gpt2-xl', 'no-distil-test', 'intermediate_5')

In [4]:
# Display the 'test' entry of the loaded object.
gsm8k_data['test']

Dataset({
    features: ['question', 'answer'],
    num_rows: 1319
})