In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 language model and its matching tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Prompt used as a quick smoke test of text generation.
prompt = "What is the capital of France?"
inputs = tokenizer(prompt, return_tensors="pt")
# GPT-2 ships without a padding token, so generate() would otherwise warn on
# every call and fall back to the EOS id by itself; pass it explicitly instead.
outputs = model.generate(
    **inputs,
    max_length=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Two candidate answers to the same prompt.
output_1, output_2 = (
    "The capital of France is Paris.",
    "France's capital is Lyon.",
)

# Pairwise comparison data: 1 marks the answer judged better (output_1 here).
feedback = dict(output_1=1, output_2=0)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Reward model
class RewardModel(nn.Module):
    """Linear head that maps a feature vector to a single scalar reward.

    Args:
        hidden_size: Size of the last dimension of the input features.
            Defaults to 768, the hidden size of the base GPT-2 model.
    """

    def __init__(self, hidden_size=768):
        super().__init__()
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result.

        ``x`` must have ``hidden_size`` as its last dimension; the output has
        the same leading dimensions with a last dimension of 1.
        """
        return self.fc(x)

# Build the reward head and the Adam optimizer that owns its parameters.
reward_model = RewardModel()
optimizer = optim.Adam(params=reward_model.parameters(), lr=0.0001)

# Loss computation and one training step
def train_reward_model(output_1, output_2, feedback):
    """Run one optimizer step on the pairwise preference loss.

    The rewards are computed from random placeholder features, so the two
    text arguments are accepted but not read.

    Args:
        output_1: First candidate text (currently unused).
        output_2: Second candidate text (currently unused).
        feedback: Mapping with keys "output_1" and "output_2" giving the
            preference weight of each candidate, e.g. 1 for the preferred
            answer and 0 for the other.

    Returns:
        The loss of this step as a Python float.
    """
    reward_1 = reward_model(torch.randn(1, 768))  # placeholder input
    reward_2 = reward_model(torch.randn(1, 768))
    margin = reward_1 - reward_2

    # Label-weighted log-likelihood of the pairwise comparison. For one-hot
    # feedback this equals -log(sigmoid(margin)) for the preferred side, but
    # logsigmoid stays finite where log(sigmoid(.)) underflows to -inf, and an
    # all-zero feedback gives a zero loss instead of log(0).
    log_sigmoid = nn.functional.logsigmoid
    loss = -(
        feedback["output_1"] * log_sigmoid(margin)
        + feedback["output_2"] * log_sigmoid(-margin)
    ).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

# Run the reward-model training step repeatedly, printing the loss each time.
NUM_REWARD_EPOCHS = 10
for epoch in range(NUM_REWARD_EPOCHS):
    loss = train_reward_model(output_1, output_2, feedback)
    print(f"Epoch {epoch}, Loss: {loss}")

In [None]:
def update_policy():
    """Take one gradient step on the policy (GPT-2) to increase its reward.

    The previous version stepped the reward model's optimizer on a reward
    computed from random noise, so it trained the reward model and never
    changed the policy. This version scores the policy's own final-layer
    hidden state for the prompt in ``inputs`` and updates only the policy
    weights; the reward model is left untouched.

    Returns:
        The policy loss (negative mean reward) of this step as a Python float.
    """
    # Keep one optimizer per policy object so Adam's running statistics persist
    # across calls; rebuild it if the `model` global was re-created by
    # re-running the loading cell.
    cached = getattr(update_policy, "_policy_state", None)
    if cached is None or cached[0] is not model:
        # Small step size, since this updates every weight of the language model.
        cached = (model, optim.Adam(model.parameters(), lr=1e-5))
        update_policy._policy_state = cached
    policy_optimizer = cached[1]

    # Use the last token's final-layer hidden state as the feature vector, so
    # the reward is a differentiable function of the policy parameters.
    hidden_states = model(**inputs, output_hidden_states=True).hidden_states
    last_token_state = hidden_states[-1][:, -1, :]
    reward = reward_model(last_token_state)
    loss = -reward.mean()  # maximize the reward

    policy_optimizer.zero_grad()
    loss.backward()
    policy_optimizer.step()
    # backward() also wrote gradients into the reward model; discard them so
    # they cannot leak into a later reward-model optimizer step.
    reward_model.zero_grad()
    return loss.item()

# Apply the policy update repeatedly, printing the loss of every step.
NUM_POLICY_STEPS = 10
for step in range(NUM_POLICY_STEPS):
    policy_loss = update_policy()
    print(f"Step {step}, Policy Loss: {policy_loss}")