In [74]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fetch the Llama 3.2 1B tokenizer and weights, then place the model on the first GPU.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B").to("cuda:0")
# Bare last expression so the notebook renders the module tree.
model


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
      (1): LlamaDecoderLayer(

In [76]:
# Display the mapping of special-token roles (e.g. bos_token, eos_token) to their string forms.
tokenizer.special_tokens_map


{'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}

In [93]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
import math
import re


# Checkpoint used for generation and scoring; swap in the commented line to use Llama.
# model_path = "meta-llama/Llama-3.2-1B"
model_path = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda:0")

# String forms of the tokenizer's begin- and end-of-sequence tokens.
BOS, EOS = tokenizer.bos_token, tokenizer.eos_token

# Text generation settings
num_samples = 5
max_length = 100
temperature = 0.7
top_p = 0.9
seed = 0

# Seed the sampler so that re-running the cell draws the same texts.
torch.manual_seed(seed)

# Encode the BOS token on its own as the prompt. add_special_tokens=False keeps
# tokenizers that prepend BOS automatically (e.g. the Llama tokenizer) from
# producing a doubled [BOS, BOS] prompt; for GPT-2 the encoding is unchanged.
prompt_ids = tokenizer.encode(
    BOS, add_special_tokens=False, return_tensors="pt"
).to(model.device)

# GPT-2 defines no pad token, so fall back to EOS for padding finished samples.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

with torch.no_grad():
    outputs = model.generate(
        input_ids=prompt_ids,
        # The prompt contains no padding, so every position is attended to.
        attention_mask=torch.ones_like(prompt_ids),
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        num_return_sequences=num_samples,
        pad_token_id=pad_token_id,
    )

# Decode the generated token ids back into text, dropping special tokens
generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Split the texts into sentences and compute each sentence's generation probability.
# Collects (sentence, probability) tuples appended by the scoring loop below.
sentence_probabilities = []

def is_valid_sentence(sentence):
    """
    Decide whether a sentence fragment should be scored.

    Returns True only when the sentence is longer than one character,
    contains no blank-line break ("\\n\\n"), and ends with ".", "!" or "?".
    """
    # Reject trivially short fragments and anything spanning a paragraph break.
    if len(sentence) <= 1 or "\n\n" in sentence:
        return False
    # Accept only sentences closed by terminal punctuation.
    return sentence[-1] in (".", "!", "?")

def preprocess_text(text):
    """
    Normalize raw text before sentence splitting.

    Strips a leading bullet marker (with surrounding whitespace) from the
    start of each line, collapses every run of whitespace into a single
    space, and trims the ends. Returns the cleaned string.
    """
    # Bullet marker at the beginning of a line, plus the whitespace around it
    bullet_prefix = re.compile(r'^[\s]*[\*\-\•\·\•\◦\–\—]\s*', flags=re.MULTILINE)
    without_bullets = bullet_prefix.sub('', text)
    # Squash newlines, tabs and repeated spaces into single spaces
    collapsed = re.sub(r'\s+', ' ', without_bullets)
    return collapsed.strip()

# Score every well-formed sentence of every generated text under the model.
for text in generated_texts:
    # text = preprocess_text(text)
    sentences = sent_tokenize(text)
    for sentence in sentences:
        if not is_valid_sentence(sentence):
            continue
        # Append the EOS token so the model is also scored on ending the sentence.
        input_ids = tokenizer(sentence + EOS, return_tensors="pt").input_ids.to(model.device)
        with torch.no_grad():
            # With labels supplied, the causal-LM loss is the MEAN cross-entropy
            # over the shifted targets, i.e. over tokens 1..N-1 of the input.
            mean_nll = model(input_ids, labels=input_ids).loss.item()
        # Undo the mean with the number of predicted tokens (N - 1, not N):
        # the first token only serves as context and contributes no loss term.
        num_scored_tokens = input_ids.size(1) - 1
        neg_log_likelihood = mean_nll * num_scored_tokens
        # NOTE: exp() underflows to 0.0 for very long sentences.
        sentence_prob = math.exp(-neg_log_likelihood)
        sentence_probabilities.append((sentence, sentence_prob))

# Display the results: one entry per scored sentence, followed by a dashed rule.
for scored_sentence in sentence_probabilities:
    sentence, prob = scored_sentence
    print(f"Sentence: {sentence}\nProbability: {prob}\n{'-'*50}")

# # 結果を保存
# with open("sentence_probabilities.txt", "w") as f:
#     for sentence, prob in sentence_probabilities:
#         f.write(f"Sentence: {sentence}\nProbability: {prob}\n{'-'*50}\n")

ValueError: The following `model_kwargs` are not used by the model: ['stop'] (note: typos in the generate arguments will also show up in this list)