## Japanese-StableLM-Base-Alpha-7BのJCommonsenseQAによる評価

### 参考

- [1] https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b

In [None]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically before each cell runs.
# NOTE(review): the visible cells import no project-local modules, so this
# may be unnecessary — confirm before removing.
%load_ext autoreload
%autoreload 2

In [None]:
# Install pinned versions of the libraries this notebook depends on.
# NOTE(review): numpy, torch and tqdm are imported below but not installed
# here — presumably provided by the runtime; confirm.
!pip install transformers==4.30.2 sentencepiece==0.1.99 accelerate==0.23.0 datasets==2.14.5 einops==0.6.1

In [None]:
# Number of validation examples to evaluate; None evaluates the whole split.
N_SAMPLES = None
# N_SAMPLES = 100
# To debug on a subset of the data, uncomment the line above.

In [None]:
import numpy as np
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm
from transformers import LlamaTokenizer, AutoModelForCausalLM

In [None]:
# Tokenizer used with this model; "▁▁" is registered as an extra special token.
tokenizer = LlamaTokenizer.from_pretrained(
    "novelai/nerdstash-tokenizer-v1",
    additional_special_tokens=["▁▁"],
)

# The checkpoint ships its own modelling code, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/japanese-stablelm-base-alpha-7b",
    trust_remote_code=True,
)

# Cast the weights to fp16 and switch to inference mode.
# NOTE(review): the fp16 cast happens even when no GPU is present; confirm
# this is intended for CPU-only runs.
model = model.half().eval()

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model = model.to("cuda")

In [None]:
# Show the loaded model's configuration as the cell output.
model.config

### データセットのダウンロード

In [None]:
# The validation split is the evaluation target.
dataset = load_dataset("leemeng/jcommonsenseqa-v1.1", split="validation")

# Debug mode: keep a reproducible (fixed-seed) random subset of N_SAMPLES rows.
if N_SAMPLES is not None:
    shuffled_dataset = dataset.shuffle(seed=42)
    subset_indices = range(N_SAMPLES)
    dataset = shuffled_dataset.select(subset_indices)

# Show the dataset summary as the cell output.
dataset

### 質問-回答例のサンプリング

コンテクストとして与える質問と模範回答の例を作成する。モデルの評価にはvalidation splitを利用しているのでここではtrain splitからサンプリングする。

In [None]:
# Load the train split, kept separate from the validation split used for
# evaluation.
train_dataset = load_dataset("leemeng/jcommonsenseqa-v1.1", split="train")

In [None]:
# Inspect the first training example.
display(train_dataset[0])

In [None]:
# One-shot example (question, five choices, gold answer) that is prepended to
# every evaluation prompt.
# The answer label must be "解答: " so that it matches the label the evaluation
# prompts end with; using a different label in the example gives the model an
# inconsistent pattern to continue.
sample_prompt = """質問: 主に子ども向けのもので、イラストのついた物語が書かれているものはどれ？
choice0: 世界
choice1: 写真集
choice2: 絵本
choice3: 論文
choice4: 図鑑
解答: 絵本"""

### テキストの生成

In [None]:
# Preview: build and print the prompt for the first evaluation example only.
for i, item in tqdm(enumerate(dataset), total=dataset.num_rows):
    # One "choiceK: ..." line per candidate answer.
    choice_lines = "".join(f"choice{k}: {item[f'choice{k}']}\n" for k in range(5))
    text = f"質問: {item['question']}\n{choice_lines}解答: "
    prompt_text = f"### 例 ###\n{sample_prompt}\n\n{text}"
    print(prompt_text)
    break

In [None]:
# Generate one answer per evaluation example with greedy decoding.
answers = []

for i, item in tqdm(enumerate(dataset), total=dataset.num_rows):
    # Assemble the one-shot prompt: example block, then the question to answer.
    choice_lines = "".join(f"choice{k}: {item[f'choice{k}']}\n" for k in range(5))
    text = f"質問: {item['question']}\n{choice_lines}解答: "
    prompt_text = f"### 例 ###\n{sample_prompt}\n\n{text}"

    input_ids = tokenizer.encode(
        prompt_text,
        add_special_tokens=False,
        return_tensors="pt",
    )
    # Prompt length in tokens, used below to cut the prompt off the output.
    n_prompt_tokens = input_ids.shape[1]

    output_ids = model.generate(
        inputs=input_ids.to(model.device),
        max_new_tokens=5,
        do_sample=False,
    )[0]

    # Decode only the newly generated tokens.
    completion = tokenizer.decode(
        output_ids[n_prompt_tokens:],
        skip_special_tokens=True,
    )
    # The answer is the first generated line; anything after it is discarded.
    answers.append(completion.partition("\n")[0])

In [None]:
# Gold answer text for every evaluation example: the choice selected by the
# example's integer "label".
choice_keys = [f"choice{k}" for k in range(5)]
correct_answers = [example[choice_keys[example["label"]]] for example in dataset]

In [None]:
# Collect the questions, the model's answers and the gold answers in one table.
qa_df = dataset.to_pandas()
qa_df["answer"] = answers
qa_df["correct_answer"] = correct_answers

# Persist the results. The file is named after the model evaluated in this
# notebook so it cannot be confused with (or overwrite) another model's
# results; debug runs get a "_sample<N>" suffix.
tag = f"_sample{N_SAMPLES}" if N_SAMPLES is not None else ""
qa_df.to_csv(f"jcommonsense_japanese_stablelm_base_alpha_7b{tag}.csv")

# Show the table as the cell output.
qa_df

### 評価

In [None]:
def norm_acc(df):
    """Return the accuracy averaged over labels, plus the per-label accuracies.

    Rows are grouped by the ``label`` column; within each group the accuracy
    is the fraction of rows whose ``answer`` equals ``correct_answer``.
    Averaging the per-label values weights every label equally, regardless of
    how many rows it has.

    Args:
        df: DataFrame with ``label``, ``answer`` and ``correct_answer``
            columns. It is not modified.

    Returns:
        A tuple ``(mean_acc, accs)``: ``accs`` lists the per-label accuracies
        in ascending label order and ``mean_acc`` is their unweighted mean.
        For a frame with no rows the result is ``(nan, [])``.
    """
    # Vectorised comparison per group instead of a row-wise apply.
    accs = [
        (group["answer"] == group["correct_answer"]).mean()
        for _, group in df.groupby("label")
    ]
    if not accs:
        # Nothing to average; return NaN explicitly rather than via np.mean([]),
        # which would also emit a RuntimeWarning.
        return float("nan"), accs
    return np.mean(accs), accs

In [None]:
# Report the label-averaged accuracy followed by the per-label breakdown.
acc, accs = norm_acc(qa_df)
overall_pct = 100 * acc
print(f"norm_acc(1-shot): {overall_pct:.1f}%")

# NOTE(review): the printed index is the position in `accs`; it equals the
# label value only if every label 0..4 occurs in qa_df — confirm for small
# debug samples.
per_label = []
for idx, label_acc in enumerate(accs):
    per_label.append(f"{idx}: {label_acc * 100:.1f}%")
print(per_label)

### 誤答の例

In [None]:
# Rows where the model's answer differs from the gold answer.
error_df = qa_df.query("answer != correct_answer")

# Print a random sample of at most 100 wrong answers for manual inspection.
n_shown = min(100, len(error_df))
for _, row in error_df.sample(n_shown).iterrows():
    choice_text = ", ".join(str(row[f"choice{k}"]) for k in range(5))
    report_lines = [
        f"問題: {row['question']}",
        f"選択肢: {choice_text}",
        f"正答: {row['correct_answer']}",
        f"モデルの回答: {row['answer']}",
        "",  # keeps the trailing newline, i.e. a blank line between entries
    ]
    print("\n".join(report_lines))