In [1]:
import os
import random
from typing import List
import pandas as pd
import numpy as np
import argparse
import torch
import datasets
from tqdm import tqdm
from transformers.trainer_utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers.generation import GenerationConfig
from datasets import load_dataset, load_from_disk


class args:
    """Notebook-wide evaluation settings, read as plain class attributes (e.g. ``args.checkpoint_path``)."""

    # All three paths share this directory; the helper name is deleted at the
    # end of the class body so that only the settings remain as attributes.
    _root = "/gemini/code/lamma3_eval"

    # Checkpoint directory handed to the ``from_pretrained`` calls.
    checkpoint_path = f"{_root}/lamma3_model/8B"
    # Saved GSM8K dataset, read back with ``load_from_disk``.
    eval_data_path = f"{_root}/eval_data/gsm8k"
    # Result directory; not referenced by the cells visible in this notebook.
    save_result_dir = f"{_root}/eval_result/gsm8k"

    # choices = ["A", "B", "C", "D"]
    # Run flags and batch size; not read by any cell visible in this notebook.
    debug, overwrite = False, False
    batch_size = 20

    del _root

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One-time setup, left commented out: fetch GSM8K ('main' config) with
# datasets.load_dataset and save it to args.eval_data_path.
# dataset = datasets.load_dataset("gsm8k",'main')
# dataset.save_to_disk(args.eval_data_path)

# Load the saved copy from args.eval_data_path.
dataset = load_from_disk(args.eval_data_path)

In [3]:
# Peek at the first test record; the output shows its 'question' and 'answer' fields.
dataset['test'][0]

{'question': "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
 'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}

In [4]:
def load_models_tokenizer():
    """Load the tokenizer and the 4-bit quantized causal LM from ``args.checkpoint_path``.

    The tokenizer is set up for batched generation with a decoder-only model:
    padding goes on the left so every prompt ends directly before the first
    generated token, and a pad token is guaranteed to exist.

    Returns:
        tuple: ``(model, tokenizer)``. The model is in eval mode and its
        generation config is set to greedy decoding with the repetition
        penalty disabled.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        args.checkpoint_path,
        # Right padding would place pad tokens between a prompt and its
        # continuation when several prompts are generated as one batch.
        padding_side='left',
    )
    # Fall back to the end-of-sequence token when the checkpoint defines no
    # pad token, so that tokenizer(..., padding=True) works on the result.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # NF4 4-bit weights with double quantization; matmuls computed in bfloat16.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4")

    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint_path,
        device_map="auto",
        quantization_config=quantization_config
    ).eval()
    model.generation_config = GenerationConfig.from_pretrained(
        args.checkpoint_path
    )
    model.generation_config.do_sample = False  # use greedy decoding
    model.generation_config.repetition_penalty = 1.0  # disable repetition penalty
    return model, tokenizer

In [5]:
# Instantiate the quantized model and its tokenizer via load_models_tokenizer().
model, tokenizer = load_models_tokenizer()
# Pad with the end-of-sequence token id.
# NOTE(review): presumably needed because the checkpoint's tokenizer has no pad
# token of its own — confirm. Cell In [128] further down rebuilds the tokenizer
# and sets pad_token_id = 128002 instead.
tokenizer.pad_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:28<00:00,  7.23s/it]


In [119]:
# Few-shot exemplars that doc_to_text() prepends to every question.
# The context manager closes the file handle right after reading, and the
# explicit encoding keeps the text independent of the machine's locale.
with open("gsm8k_prompt.txt", encoding="utf-8") as prompt_file:
    fewshot_prompt = prompt_file.read()

def doc_to_text(doc):
    """Build the full prompt for one dataset record.

    Args:
        doc: Mapping that provides the question text under ``"question"``.

    Returns:
        str: ``fewshot_prompt``, then the question on a ``Question:`` line,
        then the "Let's think step by step" cue followed by a newline.
    """
    pieces = (
        fewshot_prompt,
        "\nQuestion: ",
        doc["question"],
        "\nLet's think step by step\n",
    )
    return "".join(pieces)

def batch_process(func, *inputs):
    """Apply ``func`` to every row of a dataset batch and collect the results.

    Args:
        func: Callable invoked with one dataset row at a time
            (e.g. ``doc_to_text``).
        *inputs: The first positional value must be a ``datasets`` ``Dataset``
            holding the batch, typically obtained with ``.select(...)``.
            Any further values are ignored.

    Returns:
        list: ``func(row)`` for each row, in dataset order.

    Raises:
        AssertionError: If the first value is not a ``Dataset``; the message
            asks the caller to take a batch with ``select``.
        IndexError: If no positional value is supplied after ``func``.
    """
    # The varargs are deliberately not called ``args``: that name belongs to
    # the notebook's configuration class and would be shadowed in here.
    print(f'args len: {len(inputs)}')
    texts = inputs[0]  # expects one or more complete rows of the dataset
    # isinstance also accepts Dataset subclasses, which an exact type
    # comparison would reject.
    assert isinstance(texts, datasets.arrow_dataset.Dataset), "dataset 需要使用 select 取一个batch!"

    return [func(texts[i]) for i in range(len(texts))]


In [162]:
def generate_sample(model, tokenizer, input_txt, max_new_tokens=200,
                    repetition_penalty=1.2, verbose=True):
    """Tokenize a batch of prompts and greedily generate a continuation for each.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer exposing ``eos_token_id``; it is called
            with ``padding=True`` and ``return_tensors="pt"``.
        input_txt: Prompt text(s) passed straight to the tokenizer.
        max_new_tokens: Upper bound on newly generated tokens per prompt.
        repetition_penalty: Penalty forwarded to ``generate``.
            NOTE(review): the default of 1.2 overrides the value of 1.0 that
            ``load_models_tokenizer`` stores on the generation config with the
            comment "disable repetition penalty" — confirm which is intended.
        verbose: When true, print the encoded inputs, the attention mask and
            the raw generated ids.

    Returns:
        Whatever ``model.generate`` returns. In the run recorded in this
        notebook each returned row begins with the prompt tokens.
    """
    # The encoding carries both input_ids and attention_mask; move it to the
    # model's device before generating.
    encoded = tokenizer(input_txt, padding=True, return_tensors="pt").to(model.device)
    if verbose:
        print(encoded['input_ids'])
        print(encoded['attention_mask'])

    outputs = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        # Rows that finish early are filled up with the end-of-sequence id.
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=repetition_penalty,
        do_sample=False,
        temperature=1.0,
        top_p=1.0,
    )
    if verbose:
        print(outputs)
    return outputs

In [163]:
# Build prompts for test rows 25-31 (seven questions) and generate
# completions for all of them in a single batch.
context = batch_process(doc_to_text, dataset['test'].select(range(25,32)))
completion = generate_sample(model, tokenizer, context)


args len: 1
tensor([[128002, 128002, 128002,  ...,    555,   3094,    198],
        [128002, 128002, 128002,  ...,    555,   3094,    198],
        [128002, 128002, 128002,  ...,    555,   3094,    198],
        ...,
        [128000,  14924,     25,  ...,    555,   3094,    198],
        [128002, 128002, 128002,  ...,    555,   3094,    198],
        [128002, 128002, 128002,  ...,    555,   3094,    198]],
       device='cuda:0')
tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]], device='cuda:0')
tensor([[128002, 128002, 128002,  ..., 128001, 128001, 128001],
        [128002, 128002, 128002,  ..., 128001, 128001, 128001],
        [128002, 128002, 128002,  ..., 128001, 128001, 128001],
        ...,
        [128000,  14924,     25,  ...,    220,   2031,   3346],
        [128002, 128002, 128002,  ...,  39835,   5161,   6927]

In [165]:
# Decode the third-from-last sequence of the batch. Special tokens are kept
# (skip_special_tokens=False) so they remain visible in the printed text.
answer = tokenizer.decode(completion[-3], skip_special_tokens=False)
print(answer)

<|begin_of_text|>Question: In 2004, there were 60 kids at a cookout. In 2005, half the number of kids came to the cookout as compared to 2004. In 2006, 2/3 as many kids came to the cookout as in 2005. How many kids came to the cookout in 2006?
Let's think step by step
In 2005, 60/2=30 kids came to the cookout.
In 2006, 30/3*2=20 kids came to the cookout.
The answer is 20

Question: Zilla spent 7% of her monthly earnings on rent, half of it on her other monthly expenses, and put the rest in her savings. If she spent $133 on her rent, how much does she deposit into her savings account in a month?
Let's think step by step
Since $133 is equal to 7% of her earnings, then 1% is equal to $133/7 = $19.
The total monthly earning of Zilla is represented by 100%, so $19 x 100 = $1900 is her monthly earnings.
So, $1900/2 = $950 is spent on her other monthly expenses.
The total amount spent on the rent and other monthly expenses is $133 + $950 = $1083.
Hence, she saves $1900 - $1083 = $817 per mont

: 

In [128]:
# Scratch check of batch padding. The result is not displayed because this is
# not the last expression of the cell.
tokenizer(['123','jjds,cs fd'], padding=True, return_tensors="pt")
# tokenizer('<|end_of_text|>')

# Rebuild the tokenizer so that padding is added on the left.
# NOTE(review): this cell's execution count (In [128]) precedes the generation
# cells (In [162]/In [163]), whose recorded inputs are left-padded with 128002.
# On a fresh top-to-bottom run this cell would execute only after generation.
tokenizer = AutoTokenizer.from_pretrained(
        args.checkpoint_path,
        padding_side='left'
    )
# NOTE(review): 128002 is a hard-coded token id — presumably a reserved special
# token of this vocabulary; confirm against the checkpoint's tokenizer files.
tokenizer.pad_token_id = 128002

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [48]:
# Load the dataset stored under .../eval_data/mmlu/all and report the size of
# its test split.
dataset2 = load_from_disk("/gemini/code/lamma3_eval/eval_data/mmlu/all")
num_mmlu_test = len(dataset2['test'])
print(num_mmlu_test)

# dataset2.select()

# Draw 10 distinct row indices. The range follows the actual split size rather
# than a hard-coded 14042, so it stays valid if the saved dataset changes.
# NOTE(review): no seed is set before sampling, so the indices differ per run.
random.sample(range(num_mmlu_test), 10)


14042


[3295, 6100, 5053, 5759, 13448, 12979, 6828, 753, 3567, 3889]