In [None]:
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0
!pip install accelerate peft bitsandbytes transformers trl evaluate sacrebleu sentencepiece wandb

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
from huggingface_hub import login

# SECURITY: never hardcode access tokens in a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, token=None makes login() fall
# back to its interactive prompt. Any token that was ever committed in this cell
# must be treated as leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Quantization settings: 4-bit NF4 weights with double quantization and a
# bfloat16 compute dtype.
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# trust_remote_code is intentionally left at its default (False): the Llama
# architecture ships with transformers, so there is no reason to allow Python
# code from the Hub repository to be executed while loading.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="auto" delegates weight placement to the library, so `device`
# above is informational only and is not passed to the model here.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=config,
    device_map="auto",
)

In [None]:
from datasets import load_dataset

# Load the "main" configuration of the openai/gsm8k dataset; the splits used
# later in this notebook are ds['train'] and ds['test'].
ds = load_dataset(path="openai/gsm8k", name="main")

In [None]:
# Convert each split through the datasets API instead of building the frame by
# iterating over the dataset row by row.
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()

# Preview only the first rows instead of rendering the whole split.
test_df.head()

In [None]:
import re

def get_val(text):
    """Return the digits of the last whitespace-separated token that contains any.

    Tokens are scanned from the end of ``text``. For the first token holding at
    least one digit, every non-digit character is removed and the remaining
    digit string is returned (e.g. ``'$18,26'`` -> ``'1826'``).

    Note that signs and decimal points are stripped as well, so ``'-3.5'``
    yields ``'35'``.

    Args:
        text: Free-form text such as a GSM8K reference answer or a model
            completion.

    Returns:
        The digit string, or ``''`` when ``text`` contains no digits at all
        (instead of raising ``IndexError``).
    """
    # split() with no argument splits on any whitespace, so digits on adjacent
    # lines ("10\n20") are not merged into a single bogus number.
    for token in reversed(text.split()):
        digits = re.sub(r'[^\d]', '', token)
        if digits:
            return digits
    return ''

# Quick sanity check: the trailing 'mob' has no digits, so the previous token
# '$18,26' is used and its non-digit characters are dropped -> prints 1826.
text = 'mob #### $18,26 mob'
print(get_val(text))

In [None]:
def format_example(df, idx, include_answer=True):
    """Render one dataset row as prompt text.

    Args:
        df: DataFrame with 'question' and 'answer' columns.
        idx: Positional (``iloc``) index of the row to render.
        include_answer: When True, append the row's answer after the question.

    Returns:
        The question followed by a newline, plus the answer when requested.
    """
    row = df.iloc[idx]
    answer_part = row['answer'] if include_answer else ''
    return row['question'] + '\n' + answer_part

def gen_prompt(df, idx):
    """Build the 5-shot chat messages for the question at row ``idx`` of ``df``.

    The few-shot examples are always the first five rows of the module-level
    ``train_df`` (question and answer), each followed by a newline. The target
    question from ``df`` is appended without its answer.

    Args:
        df: DataFrame holding the question to be answered.
        idx: Positional index of that question in ``df``.

    Returns:
        A two-element list of chat messages: a system message and a user
        message carrying the full few-shot prompt.
    """
    header = "The following are 5 example math questions. Follow the instructions from these examples to answer the final question:\n"
    # 5-shot prompting: worked examples come from the training split.
    shots = "".join(format_example(train_df, shot) + "\n" for shot in range(5))
    question = format_example(df, idx, include_answer=False)
    user_content = header + shots + "Now answer this following question:\n" + question

    return [
        {"role": "system", "content": "You are a helpful assistant!"},
        {"role": "user", "content": user_content},
    ]

# gen_prompt(test_df, 0)

In [None]:
from tqdm import tqdm

def evaluate(df):
    """Compute exact-match accuracy of the model on the rows of ``df``.

    For every row, a 5-shot chat prompt is built with ``gen_prompt``, a
    completion is generated, and the number extracted from the completion by
    ``get_val`` is compared (as a string) with the number extracted from the
    row's reference ``'answer'``.

    Relies on the module-level ``tokenizer``, ``model``, ``gen_prompt`` and
    ``get_val``.

    Args:
        df: DataFrame with 'question' and 'answer' columns.

    Returns:
        The mean of the per-row correctness flags, as computed by ``np.mean``.
    """
    # Loop-invariant: stop at either the tokenizer's EOS token or <|eot_id|>.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    corr = []
    for i in tqdm(range(df.shape[0])):
        message = gen_prompt(df, i)

        input_ids = tokenizer.apply_chat_template(
            message,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # NOTE: sampling is enabled (with a low temperature and top_p), so
        # repeated runs are not guaranteed to produce identical completions.
        outputs = model.generate(
            input_ids,
            max_new_tokens=512,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.1,
            top_p=0.1,
        )
        # The output sequence starts with the prompt; keep only the new tokens.
        response = outputs[0][input_ids.shape[-1]:]
        answer = tokenizer.decode(response, skip_special_tokens=True)

        # A completion without any digit must count as a wrong answer rather
        # than abort the whole run with an IndexError from get_val.
        try:
            predict = get_val(answer)
        except IndexError:
            predict = ''
        label = get_val(df.iloc[i]['answer'])

        corr.append(predict == label)

    return np.mean(corr)

# Run the evaluation on the test split; the returned accuracy is shown as the
# cell output.
evaluate(test_df)