In [None]:
from typing import Annotated, Callable
import json
import os
import typing
from typing import Awaitable
import asyncio

from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from langchain import chat_models
from pydantic import BaseModel, Field, RootModel
import matplotlib.pyplot as plt
import numpy as np

### Define the shape of the profile an analyzer should return

In [None]:
# Shape of the result an analyzer produces for one response set.
class Profile(BaseModel):
    # Identity score; validation restricts it to the closed range [0, 1].
    identity: float = Field(ge=0, le=1)
    # Free-text description; required, with no further constraints.
    horoscope: str = Field()

    def cmp(self, other: "Profile") -> float:
        """Return the absolute difference between the two identity scores.

        Only `identity` takes part in the comparison; `horoscope` is ignored.
        """
        gap = self.identity - other.identity
        return abs(gap)


### Run setup
You will probably want to keep this cell collapsed most of the time.

In [None]:
# One question paired with the answer that was given to it.
class QuestionResponse(BaseModel):
    # Both fields are required strings with no extra validation.
    question: str = Field()
    response: str = Field()


# One response set: string key -> question/response pair.
# NOTE(review): whether the key is a question id or the question text is not
# visible from this file — confirm against the data.
Response = Annotated[dict[str, QuestionResponse], Field()]

# Several response sets, keyed by integer response id.
ResponseSet = Annotated[dict[int, Response], Field()]

# Profiles keyed by integer response id (same key space as ResponseSet).
ProfileSet = Annotated[dict[int, Profile], Field()]


# An analyzer is an async callable that turns one Response into a Profile.
Analyzer = Callable[[Response], Awaitable[Profile]]

# Load API keys from the local secrets.json file and export them as environment
# variables (presumably where the LangChain provider integrations look for
# them — confirm). JSON is read as UTF-8 explicitly instead of relying on the
# platform default encoding.
with open("secrets.json", "r", encoding="utf-8") as f:
    secrets = json.load(f)

# Only keys present in the file are exported, so a secrets file that holds just
# the key for the provider in use is sufficient.
os.environ.update(
    {
        key_name: secrets[key_name]
        for key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY")
        if key_name in secrets
    }
)

# Chat model used by the analyzer. Exactly one init_chat_model line is active;
# the commented-out lines are alternative models to switch to by hand.
# llm = chat_models.init_chat_model("gpt-4.1-nano", model_provider="openai")
# llm = chat_models.init_chat_model("gpt-4o-mini", model_provider="openai")
# llm = chat_models.init_chat_model("gpt-4o", model_provider="openai")
llm = chat_models.init_chat_model("gpt-5", model_provider="openai")
# llm = chat_models.init_chat_model("gpt-5-mini", model_provider="openai")
# llm = chat_models.init_chat_model("gpt-5-nano", model_provider="openai")
# llm = chat_models.init_chat_model("claude-3-5-haiku-latest", model_provider="anthropic")
# llm = chat_models.init_chat_model("claude-sonnet-4-20250514", model_provider="anthropic")

# Response ids left out of both the training responses and the expected
# profiles. NOTE(review): the reason for excluding these ids is not recorded.
skip_responses: set[int] = {256, 227, 134, 214, 155, 137, 83}


class ResponseSetDeserializer(RootModel[dict[int, Response]]):
    # Root model used only to parse the JSON object of response sets; the JSON
    # object keys are validated into integer response ids.
    pass


# Read as UTF-8 explicitly so parsing does not depend on the platform default.
with open("data/training_responses.json", "r", encoding="utf-8") as f:
    _loaded_responses = ResponseSetDeserializer.model_validate_json(f.read()).root

# Drop the skipped ids, keeping the file's original ordering. A comprehension
# is used so no loop variable (previously `id`) leaks into the notebook scope
# and overwrites the builtin.
training_responses = {
    response_id: response
    for response_id, response in _loaded_responses.items()
    if response_id not in skip_responses
}


async def test_analyzer(
    analyzer: Analyzer,
    responses: ResponseSet,
    expected: ProfileSet,
    repetitions: int,
) -> tuple[float, dict[int, list[Profile]]]:
    """Run `analyzer` on every response `repetitions` times and score the results.

    All analyzer calls are started together and awaited with `asyncio.gather`.

    Args:
        analyzer: Async callable producing a Profile for one response.
        responses: Responses to analyze, keyed by response id.
        expected: Reference profile for each response id.
        repetitions: How many times each response is analyzed.

    Returns:
        A pair `(mean_error, profiles)`: the mean of `Profile.cmp` between each
        produced profile and its reference (0.0 when nothing was run), and the
        produced profiles grouped by response id in repetition order.

    Raises:
        ValueError: If `responses` and `expected` do not have the same keys.
    """
    if set(responses) != set(expected):
        raise ValueError("ResponseSet keys do not match ProfileSet keys")

    async def analyze_once(
        response_id: int, repetition: int, response: Response
    ) -> tuple[int, int, Profile]:
        # Tag the result with its origin so it can be regrouped after gather.
        return response_id, repetition, await analyzer(response)

    pending = []
    for response_id, response in responses.items():
        for repetition in range(repetitions):
            pending.append(analyze_once(response_id, repetition, response))

    outcomes = await asyncio.gather(*pending)

    # gather keeps submission order, so each list ends up in repetition order.
    grouped: dict[int, list[Profile]] = {}
    for response_id, _repetition, produced in outcomes:
        grouped.setdefault(response_id, []).append(produced)

    accumulated = 0.0
    for response_id, runs in grouped.items():
        reference = expected[response_id]
        for produced in runs:
            accumulated += produced.cmp(reference)

    mean_error = accumulated / len(outcomes) if outcomes else 0.0
    return mean_error, grouped

### Set expected profiles

In [None]:
# Identity ratings from the "oluf" set, keyed by response id. Values are whole
# numbers on what looks like a 0-100 scale (presumably percent — confirm).
# In the code visible here this set is only used for the comparison plots.
# Id 155 appears here but not in expected_profiles_abel1.
expected_profiles_oluf = {
    77: 65,
    83: 56,
    94: 25,
    97: 60,
    128: 69,
    137: 52,
    139: 70,
    150: 45,
    152: 62,
    155: 60,
    156: 60,
    206: 60,
    212: 50,
    242: 56,
    244: 50,
    254: 68,
    307: 28,
    321: 31,
    402: 71,
    462: 44,
}

# Identity ratings from the "abel1" set, keyed by response id, on the same
# whole-number scale. These are the values turned into expected_profiles
# (after division by 100). Id 134 appears here but not in expected_profiles_oluf.
expected_profiles_abel1 = {
    77: 75,
    83: 67,
    94: 33,
    97: 52,
    128: 69,
    134: 37,
    137: 55,
    139: 75,
    150: 57,
    152: 81,
    156: 82,
    206: 75,
    212: 65,
    242: 76,
    244: 35,
    254: 92,
    307: 45,
    321: 39,
    402: 85,
    462: 42,
}

# Reference profiles used for scoring: the abel1 ratings divided by 100 so they
# fit the [0, 1] range required by Profile.identity, without the skipped ids.
# Built with a comprehension so no loop variable named `id` is left behind in
# the notebook scope, where it would overwrite the builtin.
expected_profiles: dict[int, Profile] = {
    response_id: Profile(identity=rating / 100, horoscope="")
    for response_id, rating in expected_profiles_abel1.items()
    if response_id not in skip_responses
}

# Response ids ordered by ascending expected identity score.
expected_order: list[int] = sorted(
    expected_profiles,
    key=lambda response_id: expected_profiles[response_id].identity,
)

In [None]:
# Response ids rated in both expected-profile sets, in ascending id order.
common_keys = sorted(
    set(expected_profiles_abel1.keys()) & set(expected_profiles_oluf.keys())
)

abel_values = [expected_profiles_abel1[k] for k in common_keys]
info_values = [expected_profiles_oluf[k] for k in common_keys]  # oluf ratings

# Figure 1: one point per shared response id. Points on the dashed diagonal
# are ids where both sets give the same rating.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(abel_values, info_values)
# Span the diagonal over the range of both sets so it covers every point,
# not just the abel1 range.
diag_lo = min(abel_values + info_values)
diag_hi = max(abel_values + info_values)
ax.plot(
    [diag_lo, diag_hi],
    [diag_lo, diag_hi],
    color="gray",
    linestyle="--",
    label="y = x",
)
ax.set_xlabel("expected_profiles_abel1")
ax.set_ylabel("expected_profiles_oluf")
ax.set_title("Comparison: abel1 vs oluf")
ax.grid(True)
ax.legend()
plt.show()


# Figure 2: both rating sets side by side, one x position per shared id.
x_indices = list(range(len(common_keys)))

# Same ascending-id order as common_keys; the values are not re-sorted.
abel_sorted = abel_values
info_sorted = info_values

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x_indices, info_sorted, color="blue", label="expected_profiles_oluf")
ax.scatter(x_indices, abel_sorted, color="orange", label="expected_profiles_abel1")
ax.set_xticks(x_indices)
ax.set_xticklabels([str(k) for k in common_keys], rotation=45)
# The x order is by response id; the previous label wrongly said the points
# were sorted by the oluf values.
ax.set_xlabel("Response ID (ascending)")
ax.set_ylabel("Identity Value")
ax.set_title("Comparison of expected_profiles_oluf vs abel1")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


## Analyzer

### Define analyzer

In [None]:
# Rubric-style variant: spells out four named score bands with key traits.
# Not the active prompt; kept so it can be selected below for experiments.
# The leading newline and four-space indentation are part of the prompt text.
IDENTITY_PROMPT_RUBRIC = """
    Could you try to rate these questionnaire with this scale?
    🧭 The Goal of the Scale The scale is meant to measure: 
    How much effort a speaker puts into shaping how others perceive them, 
    based solely on the way they speak during an interview-style conversation. 
    It does not measure: – Intelligence – Moral depth – Eloquence – Honesty 
    It only tracks how much the speaker is trying to guide, signal, or curate their identity in language—whether by expressing who they are, or by distancing themselves from who they are not. 
    📏 The Scoring Continuum (0.00 to 1.00) 
    The scale runs from 0.00 (no identity curation at all) to 1.00 (total stylized identity construction). 
    🔵 0.00–0.20: “Unpackaged Reality” The person shares experiences, thoughts, or feelings plainly—with no sign of trying to guide how they’re seen. 
    Language is direct, unembellished. No metaphor, moral framing, or stylized phrasing. The speaker doesn’t position themselves as a certain “kind” of person. 
    They don’t contrast themselves with others. No effort to preempt or correct interpretation. 
    🧩 Key trait: They just say what happened or what’s true. You do the interpreting. 
    🟦 0.30–0.50: “Low-to-Mid Identity Signaling” Some identity-shaping begins to emerge, but it’s not consistent or overt. 
    Speaker might explain “why they do things” in values-based terms. Occasional self-descriptions (“I’ve always been someone who...”). 
    Flashes of stylization or principle emerge, but not as a performance. Still largely direct, but with moments of self-framing. 
    🧩 Key trait: They try a bit to be seen a certain way, but not in every answer. 
    🟡 0.60–0.80: “Clear Curation” The speaker actively shapes perception. Most answers carry framing, stylization, or self-definition. 
    They assert identity through phrasing (“I’m not someone who…”, “The thing about me is…”). They defend or justify past actions by referencing values or traits. 
    They use tone, metaphor, or structure to signal emotional or moral framing. They contrast themselves with norms or “others” to mark difference. 
    Vulnerability is often positioned—real, but stylized. 🧩 Key trait: You feel the speaker is guiding the listener’s view of who they are. 
    🔴 0.90–1.00: “Highly Performed Identity” Every answer is crafted to support a deliberate image of who they are or who they are not. 
    Frequent use of symbolic or emotionally loaded phrasing. Strong identity signals in every response. Vulnerability is themed (“I’ve carried this my whole life,” etc.). 
    Often includes moral legacy talk, stylized contrasts, or elevated self-insight. The speaker clearly wants to be remembered a certain way. 
    🧩 Key trait: Responses feel like mini-narratives. You don’t just learn what happened—you learn what it means about them. 
    Please respond with what identity score from 0.00-1.00 best fits these answers
"""

# Holistic variant: describes the scale's goal and asks for an overall
# impression instead of matching specific score bands.
IDENTITY_PROMPT_HOLISTIC = """Could you try to rate this interview using the following principle?
The Goal of the Scale:
The scale measures the degree to which a speaker puts effort into shaping how others perceive them, based solely on the way they speak during an interview-style conversation. This does not measure intelligence, moral depth, eloquence, or honesty. It only tracks how much the speaker is trying to guide, signal, or curate their identity in language—whether by expressing who they are, or by distancing themselves from who they are not.
The Scoring Continuum (0.0 to 1.0):
The scale runs from 0.0 (no identity curation at all) to 1.0 (total stylized identity construction). Please score based on your sense of the overall degree to which the speaker manages, crafts, or performs their identity in language throughout the interview. You do not need to find evidence for a specific "level" to assign a score; just consider the total impression. At the low end, responses are direct, unstyled, and not motivated by self-presentation. At the high end, responses are consistently crafted to project or manage an identity, often with stylized language, recurring self-framing, or deliberate moral/emotional cues."""

# Prompt used as the description of the `identity` field in the analyzer's
# structured output. Selecting it explicitly makes the active variant visible;
# two always-true guards would let the later assignment silently win.
identity_prompt = IDENTITY_PROMPT_HOLISTIC


In [None]:
# Prompt used as the description of the `horoscope` field in the analyzer's
# structured output. The four-space indent on the second and third lines is
# part of the prompt text and is kept as-is.
horoscope_prompt = "\n".join(
    [
        "Can you give me a description of this interviewed person?",
        "    I want the description to make the interviewed feel seen and validated.",
        "    Make it feel somewhat like a horoscope, but make the style and word choices optimized for making the interviewed person comfortable.",
    ]
)

In [None]:
async def awesome_analyzer_that_totally_works(response: Response) -> Profile:
    """Ask the configured LLM for an identity score and a horoscope.

    The answers in `response` are flattened to one "<key>: <answer>" line per
    entry and sent to `llm` with a structured-output schema whose field
    descriptions are the current `identity_prompt` and `horoscope_prompt`.

    Args:
        response: One response set (string key -> QuestionResponse).

    Returns:
        A Profile holding the identity score and horoscope returned by the LLM.

    Raises:
        ValueError: If the LLM wrapper returns neither a dict nor an
            AnalyzerOutput instance.
    """

    # The schema is built per call, so the prompts are read at call time.
    # NOTE(review): the class docstring is a single space, presumably to keep a
    # class description out of the schema sent to the model — confirm.
    class AnalyzerOutput(BaseModel):
        """ """

        identity: float = Field(ge=0, le=1, description=identity_prompt)
        horoscope: str = Field(description=horoscope_prompt)

    structured_llm = llm.with_structured_output(AnalyzerOutput)

    # NOTE(review): each line is labelled with the dict key, not with
    # question_response.question — confirm this is the intended label.
    answer_lines = [f"{key}: {entry.response}" for key, entry in response.items()]
    transcript = "\n".join(answer_lines)

    messages = [
        SystemMessage(content="Please analyze the identity of this set of answers."),
        HumanMessage(content=transcript),
    ]
    raw_output = await structured_llm.ainvoke(messages)

    # Normalize the two accepted result shapes to an AnalyzerOutput instance.
    if isinstance(raw_output, AnalyzerOutput):
        parsed = raw_output
    elif isinstance(raw_output, dict):
        parsed = AnalyzerOutput(**raw_output)
    else:
        raise ValueError(
            f"Unexpected output type: {type(raw_output)}. Expected dict or AnalyzerOutput."
        )

    return Profile(identity=parsed.identity, horoscope=parsed.horoscope)

### Test horoscope

In [None]:
# Smoke test: run the analyzer once on a single training response.
response_id = 77
response = training_responses[response_id]
# Top-level await, as allowed in notebook cells.
profile = await awesome_analyzer_that_totally_works(response)

In [None]:
# Show the horoscope text of the current `profile` as the cell output.
profile.horoscope

### Test the analyzer

In [None]:
# Analyze every training response 3 times and score the results against
# expected_profiles. test_analyzer awaits all analyzer calls together, so this
# issues (number of responses x 3) LLM requests at once.
error, profiles = await test_analyzer(
    analyzer=awesome_analyzer_that_totally_works,
    responses=training_responses,
    expected=expected_profiles,
    repetitions=3,
)

In [None]:
# Print the overall error, then one line per response id listing the identity
# score from each repetition. Underscore-prefixed loop names are used so this
# cell does not overwrite the builtin `id` or the notebook-level `profile`.
print(f"Error: {error}")
for _response_id, _runs in profiles.items():
    print(
        f"{_response_id:>3}:" + "".join(f" {_run.identity:.2f}" for _run in _runs)
    )

In [None]:
# Print every generated horoscope, each followed by a blank line.
# Underscore-prefixed loop names keep the notebook-level `profile` intact.
for _runs in profiles.values():
    for _run in _runs:
        print(_run.horoscope, end="\n\n")

In [None]:
# Position in expected_order -> response id.
idx_to_id = dict(enumerate(expected_order))

# Response ids as strings, in expected_order, for use as x-axis categories.
ids_array = np.array([str(response_id) for response_id in expected_order])

# Expected identity per id, in expected_order.
target_values = np.array(
    [expected_profiles[response_id].identity for response_id in expected_order]
)
# After the transpose: one row per repetition, one column per id.
calc_values = np.array(
    [
        [produced.identity for produced in profiles[response_id]]
        for response_id in expected_order
    ]
).T

# Z-score the targets over all ids, and each repetition row over its own ids.
norm_target_values = (target_values - target_values.mean()) / target_values.std()
norm_calc_values = (
    calc_values - calc_values.mean(axis=1, keepdims=True)
) / calc_values.std(axis=1, keepdims=True)


def plot_target_vs_calc(target_values, calc_values, ids_array, name: str):
    """Plot target values against each row of calculated values.

    Targets are drawn as red dots. Every row of `calc_values` is drawn as blue
    crosses together with a dashed least-squares line fitted over the
    positions 0..n-1.

    Args:
        target_values: One target value per entry of `ids_array`.
        calc_values: Iterable of rows, each with one value per entry of
            `ids_array`.
        ids_array: X-axis labels, one per response.
        name: Figure title.
    """
    # NOTE(review): the fitted line is drawn at numeric positions while the
    # points use `ids_array` directly; this assumes the labels are unique
    # strings that matplotlib places at 0..n-1 in order — confirm.
    positions = np.arange(len(ids_array))

    plt.figure(figsize=(10, 6))
    plt.scatter(ids_array, target_values, label="Target Values", color="red")

    for run_number, run_values in enumerate(calc_values, start=1):
        slope, intercept = np.polyfit(positions, run_values, 1)
        trend = slope * positions + intercept

        plt.scatter(
            ids_array,
            run_values,
            label=f"Calculated Values {run_number}",
            alpha=0.6,
            color="blue",
            marker="x",
        )
        plt.plot(
            positions,
            trend,
            linestyle="--",
            linewidth=1.3,
            color="blue",
            zorder=1,
        )

    plt.xlabel("Response ID")
    plt.ylabel("Values")
    plt.title(name)
    plt.xticks(rotation=90)  # vertical labels so long ids do not overlap
    plt.legend()
    plt.tight_layout()
    plt.grid(True, axis="y")
    plt.show()


# First the z-scored view, which compares the shape of the ratings
# independent of offset and scale; then the raw values.
plot_target_vs_calc(
    name="Normalized Target vs Calculated Values",
    ids_array=ids_array,
    target_values=norm_target_values,
    calc_values=norm_calc_values,
)

plot_target_vs_calc(
    name="Target vs Calculated Values",
    ids_array=ids_array,
    target_values=target_values,
    calc_values=calc_values,
)