In [None]:
# Prints a fixed greeting and touches no other notebook state.
# NOTE(review): looks like a leftover kernel smoke test — confirm it can be deleted.
print("hello")

# ライブラリ　＆ モデル読み込み

In [None]:
# === セル0: CTEっぽいストーリー用 prompts の作成 ===
import itertools
import random

# Theme pools; either list can be extended freely.
concepts_a = [
    "quantum computing",
    "ancient mythology",
    "urban transportation",
    "neuroscience",
    "environmental sustainability",
    "childhood memories",
    "space exploration",
    "traditional Japanese festivals",
    "virtual reality",
    "music composition",
]

concepts_b = [
    "street food",
    "time travel",
    "unexpected friendship",
    "loneliness in a crowded city",
    "AI-powered art",
    "climate change",
    "lost civilizations",
    "dreams and nightmares",
    "parallel universes",
    "robots with emotions",
]

# Every (A, B) combination, A varying slowest; prompts are sampled from this pool.
all_pairs = [(theme_a, theme_b) for theme_a in concepts_a for theme_b in concepts_b]

# How many stories to request (change freely).
NUM_PROMPTS = 400

# Tile the pool until it holds at least NUM_PROMPTS entries, then draw
# NUM_PROMPTS of them without replacement from the tiled pool.
random.seed(42)
pool_repeats = NUM_PROMPTS // len(all_pairs) + 1
sampled_pairs = random.sample(all_pairs * pool_repeats, NUM_PROMPTS)


def _story_prompt(concept_a, concept_b):
    """Return the story-writing instruction text for one concept pair."""
    return f"""
You are a highly creative writer.

Write a short imaginative story (around 400–600 words) that connects the following two concepts
in an unexpected, original, and coherent way:

- Concept A: {concept_a}
- Concept B: {concept_b}

Requirements:
- The story should be surprising but still logically and emotionally coherent.
- Avoid clichés; try to introduce at least one genuinely unusual idea or twist.
- Make sure both concepts play an essential role in the story.
- Write in natural, fluent English.

Return only the story text, without any explanations or headings.
""".strip()


prompts = [_story_prompt(first, second) for first, second in sampled_pairs]

print("Number of prompts:", len(prompts))
print("Sample prompt:\n", prompts[0][:500], "...\n")

In [None]:
# === セル1: ライブラリ・モデル準備 ===
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the Qwen instruct checkpoint together with its tokenizer.
model_name = "Qwen/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# bfloat16 weights; device placement is delegated to device_map="auto".
load_options = dict(torch_dtype=torch.bfloat16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

print(model.config)

# `prompts` must already exist (it is built in a separate cell).
print("num prompts:", len(prompts))

# 生成 & 内部特徴量抽出（EOS + 全レイヤの生成平均）

In [None]:
# === Story generation & hidden-state feature extraction ===
# Per sample this collects:
#   - the final-layer hidden state at the last position of the full sequence
#   - for every layer, the mean hidden state over the generated positions

stories = []          # list of {"prompt", "story"} dicts
eos_hidden_all = []   # per sample: final-layer hidden state of the last token
all_gen_layers = []   # per sample: [num_layers+1, hidden_dim] matrix

# do_sample=True draws from torch's RNG; seed it so a fresh run reproduces the stories.
torch.manual_seed(42)

for prompt in tqdm(prompts, desc="Generating stories & extracting features"):
    # NOTE(review): the raw prompt is tokenized directly. For an Instruct checkpoint,
    # tokenizer.apply_chat_template may be the intended input format — confirm.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # NOTE(review): the prompt asks for 400–600 words but generation stops after
        # 300 new tokens, so stories are likely truncated — confirm this is intended.
        out = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=1.0,
            do_sample=True,
            return_dict_in_generate=True,
        )
        full_ids = out.sequences  # [1, input_len + number of generated tokens]

        # The hidden states returned by generate() are organised per decoding step, and
        # with KV caching every step after the first covers only the newest position.
        # Slicing the last step therefore never yields the generated span. Re-encode
        # the finished sequence once to obtain [1, seq_len, hidden_dim] for every layer.
        encoded = model(input_ids=full_ids, output_hidden_states=True)

    # Decode only the newly generated tokens; out.sequences starts with the prompt.
    story = tokenizer.decode(full_ids[0, input_len:], skip_special_tokens=True)
    stories.append({"prompt": prompt, "story": story})

    # Tuple with one [1, seq_len, hidden_dim] tensor per layer (embeddings included).
    layer_hiddens = encoded.hidden_states
    final_layer_hidden = layer_hiddens[-1]

    seq_len = final_layer_hidden.shape[1]
    gen_len = max(1, seq_len - input_len)

    # 1) Final-layer state of the last token (EOS when generation ended naturally,
    #    otherwise the last token produced before the max_new_tokens cut-off).
    eos_hidden = final_layer_hidden[0, -1, :]  # [hidden_dim]
    eos_hidden_all.append(eos_hidden.to(torch.float32).cpu())

    # 2) Per-layer mean over the generated positions.
    layer_gen_vecs = []
    for layer_hidden in layer_hiddens:
        gen_hidden = layer_hidden[0, -gen_len:, :]  # [gen_len, hidden_dim]
        # Cast before averaging so the mean is not rounded to bfloat16.
        gen_mean = gen_hidden.to(torch.float32).mean(dim=0)  # [hidden_dim]
        layer_gen_vecs.append(gen_mean.cpu().numpy())

    # Stack into a (num_layers+1, hidden_dim) matrix for this sample.
    layer_gen_mat = np.stack(layer_gen_vecs, axis=0)
    all_gen_layers.append(layer_gen_mat)

    # Drop references to the large per-sample outputs before the next iteration.
    del out, encoded, layer_hiddens

# Combine all samples into [N, num_layers+1, hidden_dim].
all_gen_layers = np.stack(all_gen_layers, axis=0)

print("生成完了 — 合計 stories:", len(stories))
print("eos_hidden_all[0] shape:", eos_hidden_all[0].shape)
print("all_gen_layers shape:", all_gen_layers.shape)  # e.g. (N, 29, 3584)

# GPT-4o-mini (OpenAI API) で creativity / originality / coherence を採点

In [None]:
from openai import OpenAI

# Read the API key from the environment instead of pasting it into the notebook,
# so the secret never ends up in the saved file or in version control.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    # Fail here rather than at the first scoring request.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the kernel."
    )

client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# === セル3: GPT-4o-mini (OpenAI API) で creativity / originality / coherence を採点 ===
import json
import time

def _coerce_score(value):
    """Return ``value`` as a number within [1, 10], or None when it is not one."""
    if isinstance(value, bool):
        # bool is an int subclass; true/false is never a valid score.
        return None
    if isinstance(value, str):
        try:
            value = float(value.strip())
        except ValueError:
            return None
    if not isinstance(value, (int, float)):
        return None
    # NaN and infinities fail this range check as well.
    return value if 1 <= value <= 10 else None


def evaluate_creativity(story: str, api_client=None):
    """Ask a chat model to rate a story on creativity, originality and coherence.

    Args:
        story: Story text to evaluate.
        api_client: Optional object exposing ``chat.completions.create``.
            When omitted, the module-level ``client`` is used.

    Returns:
        A dict with the keys "creativity", "originality" and "coherence".
        Each value is a number in [1, 10], or None when the reply did not
        contain a usable value for that key. All three are None when the
        reply is missing or is not a JSON object.
    """
    no_scores = {"creativity": None, "originality": None, "coherence": None}
    api = api_client if api_client is not None else client

    user_prompt = f"""
You are a creativity evaluator.
Read the story below and score it from 1 to 10 on creativity, originality, and coherence.

Story:
\"\"\"{story}\"\"\"

Return ONLY a JSON object with the following exact schema:
{{
  "creativity": <number from 1 to 10>,
  "originality": <number from 1 to 10>,
  "coherence": <number from 1 to 10>
}}
"""
    resp = api.chat.completions.create(
        model="gpt-4o-mini",  # mini for cost; switch to gpt-4o for accuracy
        messages=[
            {"role": "system", "content": "You are an expert creativity evaluator."},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        response_format={"type": "json_object"},
    )

    txt = resp.choices[0].message.content
    try:
        # TypeError covers a missing (None) message content.
        data = json.loads(txt)
    except (TypeError, ValueError) as e:
        print("JSON parse error:", e)
        print("raw content:", (txt or "")[:200])
        return no_scores

    if not isinstance(data, dict):
        print("JSON parse error:", "expected a JSON object")
        print("raw content:", txt[:200])
        return no_scores

    # Validate each field so downstream arithmetic only ever sees numbers or None.
    return {key: _coerce_score(data.get(key)) for key in no_scores}

# Score every entry of `stories` in place: each dict gains a "score" key.
scoring_progress = tqdm(stories, desc="Evaluating creativity with GPT-4 API")
for item in scoring_progress:
    item["score"] = score = evaluate_creativity(item["story"])
    # Pause between requests as a simple rate-limit guard (tune as needed).
    time.sleep(1.2)

print("スコア例:")
print(stories[:3])

# 合成 creativity_score を作る

In [None]:
# === Derive a weighted creativity_score and store it on every stories entry ===

W_ORIG = 0.5
W_CREAT = 0.3
W_COH = 0.2

for item in stories:
    s = item.get("score", None)
    # Collect the three sub-scores only when a score dict is present.
    parts = (s["creativity"], s["originality"], s["coherence"]) if s else None

    if parts is not None and None not in parts:
        creat, orig, coh = parts
        item["creativity_score"] = W_ORIG * orig + W_CREAT * creat + W_COH * coh
    else:
        # Missing score dict or any missing sub-score: no composite score.
        item["creativity_score"] = None

print("creativity_score サンプル:")
for item in stories[:3]:
    print(item["creativity_score"], item.get("score"))

# EOS ベクトル + スコアを解析しやすい形にまとめる 

In [None]:
# === Pack last-token hidden states and creativity_score into NumPy arrays ===


def _as_float32_array(vec):
    """Convert a torch tensor or array-like into a float32 NumPy array."""
    if isinstance(vec, torch.Tensor):
        return vec.detach().to(torch.float32).cpu().numpy()
    return np.asarray(vec, dtype=np.float32)


# Keep only samples that received a composite score.
valid_items = []
for hidden, entry in zip(eos_hidden_all, stories):
    if entry.get("creativity_score") is not None:
        valid_items.append((hidden, entry["creativity_score"]))

X_eos = np.stack([_as_float32_array(hidden) for hidden, _ in valid_items])
y_scores = np.array([target for _, target in valid_items], dtype=np.float32)

print("X_eos shape:", X_eos.shape)      # [N_valid, hidden_dim]
print("y_scores shape:", y_scores.shape)

# 全レイヤ GEN 平均 + スコアもまとめておく

In [None]:
# === Align the per-layer GEN means with the scored samples ===

# Some stories may lack a creativity_score, so filter all_gen_layers with the
# same validity rule that was used for the EOS matrix.
has_score = [entry.get("creativity_score") is not None for entry in stories]
valid_mask = np.array(has_score, dtype=bool)

all_gen_layers_valid = all_gen_layers[valid_mask]  # [N_valid, L, D]
print("all_gen_layers_valid shape:", all_gen_layers_valid.shape)  # [N_valid, L, D]

# y_scores was built from the same valid entries, so all three row counts must match.
n_valid = all_gen_layers_valid.shape[0]
assert n_valid == X_eos.shape[0] == y_scores.shape[0]

# データ出力

In [None]:
# === セル: data ディレクトリに CSV を書き出す ===
import os
import numpy as np
import pandas as pd
import torch

# ------------------------------------------------------------------
# 0. Make sure the output directory exists
# ------------------------------------------------------------------
os.makedirs("data", exist_ok=True)


# ------------------------------------------------------------------
# 1. Build a DataFrame holding only metadata and scores
#    Each stories entry is read through the keys "prompt", "story",
#    "score" (creativity / originality / coherence) and "creativity_score".
# ------------------------------------------------------------------
def _meta_row(position, entry):
    """Flatten one stories entry into a metadata/score row."""
    sub_scores = entry.get("score", {}) or {}
    return {
        "idx": position,
        "prompt": entry.get("prompt"),
        "story": entry.get("story"),
        "creativity": sub_scores.get("creativity"),
        "originality": sub_scores.get("originality"),
        "coherence": sub_scores.get("coherence"),
        "creativity_score": entry.get("creativity_score"),
    }


base_rows = [_meta_row(position, entry) for position, entry in enumerate(stories)]
df_base = pd.DataFrame(base_rows)

# CSV with metadata and scores only.
meta_path = os.path.join("data", "meta_scores.csv")
df_base.to_csv(meta_path, index=False)
print("saved:", meta_path)

# ------------------------------------------------------------------
# 2. Boolean mask of rows whose creativity_score is present
# ------------------------------------------------------------------
score_present = df_base["creativity_score"].notna()
valid_mask = score_present.to_numpy()

print("num total samples :", len(df_base))
print("num valid samples :", valid_mask.sum())

# ------------------------------------------------------------------
# 3. Turn the last-token hidden states into a matrix and keep the valid rows
#    eos_hidden_all holds one vector per sample (torch.Tensor or array-like).
# ------------------------------------------------------------------
# The unfiltered matrix gets its own name: rebinding X_eos here would replace the
# valid-only X_eos built earlier and leave it misaligned with y_scores.
X_eos_all = np.stack([
    (
        h.detach().to(torch.float32).cpu().numpy()
        if isinstance(h, torch.Tensor)
        else np.asarray(h, dtype=np.float32)
    )
    for h in eos_hidden_all
])
assert X_eos_all.shape[0] == len(df_base), "eos_hidden_all and stories differ in length"

X_eos_valid = X_eos_all[valid_mask]  # [N_valid, hidden_dim]

eos_cols = [f"eos_dim_{i}" for i in range(X_eos_valid.shape[1])]

# Metadata columns first, then one column per hidden dimension.
df_eos = pd.concat(
    [df_base.loc[valid_mask].reset_index(drop=True),
     pd.DataFrame(X_eos_valid, columns=eos_cols)],
    axis=1
)

eos_path = os.path.join("data", "creativity_eos_features.csv")
df_eos.to_csv(eos_path, index=False)
print("saved:", eos_path)

# ------------------------------------------------------------------
# 4. Save the GEN mean of the last layer only
#    all_gen_layers is indexed as [N, L, hidden_dim] (built in the generation cell).
# ------------------------------------------------------------------
L = all_gen_layers.shape[1]  # size of the layer axis
final_gen = all_gen_layers[:, -1]             # [N, hidden_dim], last entry on the layer axis
final_gen_valid = final_gen[valid_mask]       # [N_valid, hidden_dim]

gen_cols = [f"gen_dim_{dim}" for dim in range(final_gen_valid.shape[-1])]

# Metadata columns first, then one column per hidden dimension.
meta_valid_for_gen = df_base.loc[valid_mask].reset_index(drop=True)
gen_feature_frame = pd.DataFrame(final_gen_valid, columns=gen_cols)
df_gen = pd.concat([meta_valid_for_gen, gen_feature_frame], axis=1)

gen_path = os.path.join("data", "creativity_gen_final_layer_features.csv")
df_gen.to_csv(gen_path, index=False)
print("saved:", gen_path)

# ------------------------------------------------------------------
# 5. Save df_full: metadata + EOS features + last-layer GEN features
#    Intended as the main table for later analysis.
# ------------------------------------------------------------------
full_parts = [
    df_base.loc[valid_mask].reset_index(drop=True),
    pd.DataFrame(X_eos_valid, columns=eos_cols),
    pd.DataFrame(final_gen_valid, columns=gen_cols),
]
df_full = pd.concat(full_parts, axis=1)

full_path = os.path.join("data", "creativity_df_full.csv")
df_full.to_csv(full_path, index=False)
print("saved:", full_path)

print("\nDone. CSVs are in ./data :")
for written_name in (
    "meta_scores.csv",
    "creativity_eos_features.csv",
    "creativity_gen_final_layer_features.csv",
    "creativity_df_full.csv",
):
    print(" - " + written_name)