In [46]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM


model_name = "stabilityai/japanese-stablelm-3b-4e1t-instruct" # stabilityai/japanese-stablelm-instruct-gamma-7b

tokenizer = AutoTokenizer.from_pretrained(model_name)
if model_name == "stabilityai/japanese-stablelm-3b-4e1t-instruct":
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.half
    )
model.eval()

if torch.cuda.is_available():
    model = model.to("cuda")

def build_prompt(user_query, inputs="", sep="\n\n### "):
    # sys_msg = "あなたは国語教師です。以下の問題を考え、正しい選択肢を説明してください。"
    sys_msg = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"
    p = sys_msg
    roles = ["指示", "応答"]
    msgs = [": \n" + user_query, ": \n"]
    if inputs:
        roles.insert(1, "入力")
        msgs.insert(1, ": \n" + inputs)
    for role, msg in zip(roles, msgs):
        p += sep + role + msg
    return p


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
configuration_stablelm_epoch.py: 100%|██████████| 5.27k/5.27k [00:00<00:00, 2.64MB/s]
A new version of the following files was downloaded from https://huggingface.co/stabilityai/japanese-stablelm-3b-4e1t-instruct:
- configuration_stablelm_epoch.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
modeling_stablelm_epoch.py: 100%|██████████| 27.8k/27.8k [00:00<00:00, 13.9MB/s]
A new version of the following files was downloaded from https://huggingface.co/stabilityai/japanese-stablelm-3b-4e1t-instruct:
- modeling_stablelm_epoch.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
model.safetensors: 100%|██████████| 5.59G/5.59G [09:05<00:00, 10.2MB/s]
generation_config.json: 

In [55]:
N2 = pd.read_csv('./GrammarDataset/N2_grammar_cleaned.csv')
N2

Unnamed: 0,Question,Options,Answer
0,___とたん、眠くなった。,a.勉強が終わった b.勉強をした c.勉強をしていた,a
1,弟は、やっと見つけた就職先なのに、___のうちに、もう辞めてしまった。,a.仕事を覚えたか覚えないか b.働いているかいないか c.友達がいるかいないか,a
2,この料理、熱いうちに___。,a.おいしいですよ b.召し上がってください c.いい香りいがします,b
3,交通費は___一方だ。,a.値上がりする b.値上がりしている c.値上がりした,a
4,___としています。,a.間も無く夏が終わろう b.今日は雨が降ろう c.今年の冬は寒くなろう,a
...,...,...,...
124,まだたっぷり時間があったのだから、あんなに___。,a.急ぐことはなかった b.急ぐものではなかった c.急ぐはずがなかった d.急ぐよりほかなかった,a
125,彼のちょっとした態度だけで自分が嫌われていると思うなんて、___。,a.考えすぎるものだ b.考えすぎというものだ c.考えすぎたものだ d.考えすぎというもの...,b
126,残念ですが、これだけ結果が悪ければ、この計画は失敗だと___。,a.言うものだ b.言ったところだ c.言わないことはない d.言わざるを得ない,c
127,留学生には日本語だけでなく、日本の文化や社会のことも___。,a.学ぶものだ b.学ばせるものだ c.学びたいものだ d.学んでほしいものだ,d


In [56]:
def generate_embedding(text):
    tokens = tokenizer(text, return_tensors="pt", add_special_tokens=True)
    if torch.cuda.is_available():
        tokens = tokens.to('cuda')
    with torch.no_grad():
        embeddings = model(**tokens).logits
    return embeddings.cpu().mean(dim=1).squeeze(0)

def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def ask_sensei(question, options, evaluate=True):
    question = question.replace("___", "[MASK]")
    # Infer with prompt without any additional input
    user_inputs = {
        "user_query":'まず、文法を基づいて、最もよい選択肢のアルファベットを一つ選びなさい。次に、文法について、十五字以内で説明しなさい。',
        "inputs": f"問題：{question} 選択肢：{options}"
    }

    prompt = build_prompt(**user_inputs)

    # print(prompt)
    input_ids = tokenizer.encode(
        prompt, 
        add_special_tokens=True, 
        return_tensors="pt"
    )

    attention_mask = torch.ones_like(input_ids).to('cuda')  # Create attention mask with all 1s for non-padding tokens

    tokens = model.generate(
        input_ids.to(device=model.device),
        attention_mask=attention_mask, 
        max_new_tokens=64,
        # temperature=0.1,
        repetition_penalty=1.1,
        # top_p=0.95,
        do_sample=False,
    )
    out = tokenizer.decode(tokens[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

    if evaluate:
        option_embeddings = [generate_embedding(option) for option in options]
        response_embedding = generate_embedding(out)
        for index, option_emb in enumerate(option_embeddings):
            print(f"Cosine similarity with option {chr(index+97)} is {cosine_similarity(option_emb, response_embedding)}")

    print(out)
    print("------------")
    return out

In [57]:
sensei_ans = []
for index, row in N2.iterrows():
    answer = ask_sensei(row.Question, row.Options)
    sensei_ans.append(answer)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Cosine similarity with option a is 0.7913497090339661
Cosine similarity with option b is 0.8475881218910217
Cosine similarity with option c is 0.9011911749839783
Cosine similarity with option d is 0.909726083278656
Cosine similarity with option e is 0.8833911418914795
Cosine similarity with option f is 0.8619307279586792
Cosine similarity with option g is 0.8143980503082275
Cosine similarity with option h is 0.8087351322174072
Cosine similarity with option i is 0.9255694150924683
Cosine similarity with option j is 0.8652255535125732
Cosine similarity with option k is 0.8731501698493958
Cosine similarity with option l is 0.8475881218910217
Cosine similarity with option m is 0.9011911749839783
Cosine similarity with option n is 0.909726083278656
Cosine similarity with option o is 0.8527827858924866
Cosine similarity with option p is 0.8567222952842712
Cosine similarity with option q is 0.9255694150924683
Cosine similarity with option r is 0.8652255535125732
Cosine similarity with option 

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Cosine similarity with option a is 0.784991443157196
Cosine similarity with option b is 0.8525782227516174
Cosine similarity with option c is 0.9070346355438232
Cosine similarity with option d is 0.8580439686775208
Cosine similarity with option e is 0.8610917925834656
Cosine similarity with option f is 0.8699900507926941
Cosine similarity with option g is 0.8754638433456421
Cosine similarity with option h is 0.9264522790908813
Cosine similarity with option i is 0.8751600980758667
Cosine similarity with option j is 0.8699900507926941
Cosine similarity with option k is 0.8754638433456421
Cosine similarity with option l is 0.8566635251045227
Cosine similarity with option m is 0.8750452995300293
Cosine similarity with option n is 0.8751600980758667
Cosine similarity with option o is 0.8732742667198181
Cosine similarity with option p is 0.8601803183555603
Cosine similarity with option q is 0.8525782227516174
Cosine similarity with option r is 0.915544867515564
Cosine similarity with option 

In [19]:
N2['Answer'] = sensei_ans