In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
import re
from datasets import load_dataset, load_from_disk
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()


device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


In [2]:
# student model

student_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") # Google also released Gemma 2 - 2b, use the following id for the newer model: "google/gemma-2-2b-it"
student_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    torch_dtype=torch.float32
).to(device)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [3]:
student_tokenizer.add_special_tokens({"pad_token":"<pad>"})
student_model.generation_config.pad_token_id = student_tokenizer.pad_token_id

In [11]:
# Testing cell
inputs = student_tokenizer('The quadratic formula is', return_tensors="pt").to(device)



outputs = student_model.generate(**inputs, max_new_tokens=50, do_sample=False, truncate=True, eos_token_id=50256, ).to(device)
output_answer = student_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
output_answer


'The quadratic formula is a very useful tool for solving problems involving the quadratic equation. It is also used to solve problems involving the cubic equation.\n\nThe quadratic formula is a very useful tool for solving problems involving the quadratic equation. It is'

In [18]:
t = student_tokenizer('The quadratic formula is a very useful tool for solving problems involving the quadratic equation. It is also used to solve problems involving the cubic equation.\n\nThe quadratic formula is a very useful tool for solving problems involving the quadratic equation. It is also used to solve problems involving the cubic equation.\n\nThe quadratic formula is a very useful tool for solving problems involving the quadratic equation. It is also used to solve problems involving the cubic equation.\n\nThe quadratic formula is a very useful tool for solving problems involving the quadratic equation. It is also used to solve problems involving the cubic equation.\n\nThe quadratic formula is a very useful tool for solving problems involving the quadratic equation.', return_tensors="pt")
len(t['input_ids'][0].tolist())

150

In [4]:
gsm8k = load_dataset("openai/gsm8k", "main")
gsm8k_test = gsm8k['test'].to_pandas()
gsm8k_train = gsm8k['train'].to_pandas()

In [5]:
def save_df(df : pd.DataFrame, model : str, dist_folder : str, edition : str):
    base_path = Path()
    base_path = base_path / '..' / 'logs' / model / dist_folder / f'{edition}.csv'
    df.to_csv(base_path)

def assess_model(df, model : str, dist_folder : str, subset : str, prompt_function, tokenizer, generator, max_tokens = 50, start = 0, end = None):
    numrows = df.shape[0]
    if not end:
        end = numrows
    i = 0
    df['model_prompt'] = ''
    df['model_answer'] = ''
    df['model_output_tokens'] = 0
    df['model_max_tokens'] = False
    for i, row in df[start:end].iterrows():
        df.loc[i, 'model_prompt'] = 
        tokens = tokenizer(row['question'])
        val, out_tokens = generator(tokens, max_tokens)
        out_token_count = len(out_tokens[0].tolist()) - len(tokens['input_ids'][0].tolist())
        df.loc[i, 'model_answer'] = val
        df.loc[i, 'model_output_tokens'] = out_token_count
        df.loc[i, 'model_max_tokens'] = True if out_token_count == max_tokens else False
        i += 1
        if i % 5 == 0:
            print(f'Completed : {i} out of {numrows}')
            print(f'Completed : {round(i/numrows, 4) * 100}%')
            print('\n')
        if i % 10 == 0:
            print(f'Completed : {i} out of {numrows}')
            save_df(df, model, dist_folder, f'intermediate_{i}')
    save_df(df, model, dist_folder, f'FINAL_Completed({start} : {end})')


In [6]:
def default_prompt(question):
    return question + '\n Please respond with just the answer. The answer is:'

In [7]:
def student_tokenizerd(prompt):
    tokens = student_tokenizer(prompt, return_tensors="pt").to(device)
    return tokens

In [8]:
def student_generation(tokens, max_tokens=50):
    outputs = student_model.generate(**tokens, max_new_tokens=max_tokens, do_sample=False).to(device)
    output_answer = student_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return output_answer, outputs

In [24]:
hi, to = student_generation(student_tokenizerd(default_prompt(gsm8k_test['question'][6])), 50)

In [9]:
assess_model(gsm8k_test, 'gemma2b-it', 'no-distil-test', 'test', student_tokenizerd, student_generation, 300)

TypeError: transformers.generation.utils.GenerationMixin.generate() argument after ** must be a mapping, not str

# Load data

In [2]:
def load_disk(model : str, dist_folder : str, edition : str = 'FINAL'):
    base_path = Path()
    base_path = base_path / '..' / 'logs' / model / dist_folder / edition
    return load_from_disk(base_path)

In [3]:
gsm8k_data = load_disk('gpt2-xl', 'no-distil-test', 'intermediate_5')

In [4]:
gsm8k_data['test']

Dataset({
    features: ['question', 'answer'],
    num_rows: 1319
})