In [None]:
import os
from dotenv import load_dotenv
from google import generativeai as genai
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead
import random
import torch
import json
from json import JSONDecodeError
import pandas as pd
import time

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# API key handed to the Gemini client; None when the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# Directory the training loop writes its PPO checkpoints into.
os.makedirs("./checkpoints", exist_ok=True)

In [None]:
# Gemini model name used for every simulated parent role defined in this file.
base_gemini_model = "gemini-2.5-flash"

# Personality trait lists; they are joined with ", " and interpolated into the
# policy strings and system prompts.
mom_personality_traits = ['Warm', "Loving", "Empathetic", "Encouraging"]
dad_personality_traits = ["Practical", "Loving", "Supportive", "Challenging"]

# Mom's calibration policy: maps the kind of reply Adam gives to the tone of
# her reaction. Interpolated into Mom's response system prompts.
response_calibration_policy_mom = f"""
* NOTE: Stay in line with your defined personality traits: {", ".join(mom_personality_traits)}
- PROSOCIAL / ON-TOPIC / COOPERATIVE → Warm praise or appreciation (joyful/proud).
- OFF-TOPIC / INCOHERENT / LOW-EFFORT → Neutral, concise redirection to the topic (no praise). State the relevant expectation.
- RUDE / HURTFUL / UNSAFE / DEFIANT → Gentle but firm boundary and concern. Name the issue and state the expected behavior (no sarcasm, no profanity).
"""

# Dad's calibration policy: same three categories, worded for Dad's traits.
# Interpolated into Dad's response system prompts.
response_calibration_policy_dad = f"""
* NOTE: Stay in line with your defined personality traits: {", ".join(dad_personality_traits)}
- PROSOCIAL / ON-TOPIC / COOPERATIVE → Concrete appreciation (proud/calm).
- OFF-TOPIC / INCOHERENT / LOW-EFFORT → Neutral, practical redirection to the task or topic.
- RUDE / HURTFUL / UNSAFE / DEFIANT → Clear boundary + expectation, steady tone (no sarcasm, no profanity).
"""

# --- Primary initializers (the parent who speaks first) ---

# System prompt for Mom when she opens the episode; the user prompt supplies
# a single SITUATION field.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
# NOTE(review): GOAL asks for "a few to several sentences" while STYLE caps the
# message at 2-3 sentences — the two instructions conflict; confirm which one
# is intended before editing the prompt text.
mom_init_primary_sp = f"""
ROLE: Mom — the mother of a little boy named Adam. You open the episode.
PERSONALITY: {", ".join(mom_personality_traits)}

INPUT YOU WILL RECEIVE IN THE PROMPT:
- SITUATION: the exact situation for this conversation.

GOAL:
- Clearly frame the scenario, based on the inputted SITUATION, in a few to several sentences, and end with ONE question or statement to your child, Adam.
- The message should end with a direct statement, question, or proposal addressed to your child, Adam.
- Imagine you are a mom, speaking to your young boy, Adam => this is how the message should be generated, a realistic conversation.

STYLE:
- No numbered lists, no emojis.
- Maximum 2-3 sentences.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
- The scenario can be positive, neutral, or involve mild conflict — variety is encouraged.
"""

# Gemini model bound to the prompt above via `system_instruction`.
mom_init_primary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=mom_init_primary_sp
)

# System prompt for Dad when he opens the episode; the user prompt supplies
# a single SITUATION field.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
# NOTE(review): GOAL asks for "a few to several sentences" while STYLE caps the
# message at 2-3 sentences — the two instructions conflict; confirm which one
# is intended before editing the prompt text.
dad_init_primary_sp = f"""
ROLE: Dad — the father of a little boy named Adam. You open the episode.
PERSONALITY: {", ".join(dad_personality_traits)}

INPUT YOU WILL RECEIVE IN THE PROMPT:
- SITUATION: the exact situation for this conversation.

GOAL:
- Clearly frame the scenario, based on the inputted SITUATION, in a few to several sentences, and end with ONE question or statement to your child, Adam.
- The message should end with a direct statement, question, or proposal addressed to your child, Adam.
- Imagine you are a dad, speaking to your young boy, Adam => this is how the message should be generated, a realistic conversation.

STYLE:
- No numbered lists, no emojis.
- Maximum 2-3 sentences.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
- The scenario can be positive, neutral, or involve mild conflict — variety is encouraged.
"""

# Gemini model bound to the prompt above via `system_instruction`.
dad_init_primary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=dad_init_primary_sp
)

# --- Secondary initializers (the parent who speaks second) ---

# System prompt for Mom when she speaks second; the user prompt supplies Dad's
# opener as OTHER_PARENT_MESSAGE.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
# NOTE(review): the last GOAL bullet reads "that which builds of the message" —
# likely a typo for "that builds off the message"; the prompt text is left
# untouched here because it is sent to the model verbatim.
mom_init_secondary_sp = f"""
ROLE: Mom — the mother of a little boy named Adam. You open second in the episode, after the Dad.
PERSONALITY: {", ".join(mom_personality_traits)}

INPUT YOU WILL RECEIVE IN THE PROMPT:
- OTHER_PARENT_MESSAGE: the exact opener the Dad just sent to Adam.

GOAL:
- Build directly on Dad’s opener: reinforce, clarify, etc.
- Keep it consistent with Dad’s message (do not contradict without a brief, supportive reason).
- Address Adam directly and end with a direct statement, question, or proposal addressed to your child, Adam, that which builds of the message from Dad.

STYLE:
- No numbered lists, no emojis.
- Maximum 2-3 sentences.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
- You may explicitly reference Dad’s message (e.g., “Like Dad said…”).
"""

# Gemini model bound to the prompt above via `system_instruction`.
mom_init_secondary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=mom_init_secondary_sp
)

# System prompt for Dad when he speaks second; the user prompt supplies Mom's
# opener as OTHER_PARENT_MESSAGE.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
# NOTE(review): the last GOAL bullet reads "that which builds of the message" —
# likely a typo for "that builds off the message"; the prompt text is left
# untouched here because it is sent to the model verbatim.
dad_init_secondary_sp = f"""
ROLE: Dad — the father of a little boy named Adam. You open second in the episode, after the Mom.
PERSONALITY: {", ".join(dad_personality_traits)}

INPUT YOU WILL RECEIVE IN THE PROMPT:
- OTHER_PARENT_MESSAGE: the exact opener the Mom just sent to Adam.

GOAL:
- Build directly on Mom’s opener: reinforce, clarify, etc.
- Keep it consistent with Mom’s message (do not contradict without a brief, constructive reason).
- Address Adam directly and end with a direct statement, question, or proposal addressed to your child, Adam, that which builds of the message from Mom.

STYLE:
- No numbered lists, no emojis.
- Maximum 2-3 sentences.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
- You may explicitly reference Mom’s message (e.g., “Listen to your mother…”).
"""

# Gemini model bound to the prompt above via `system_instruction`.
dad_init_secondary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=dad_init_secondary_sp
)

# --- Response models (the parents react to Adam’s message) ---
# Primary responder: goes first after Adam replies.
# Secondary responder: goes second and sees the primary responder's message too.

# MOM — PRIMARY RESPONDER
# System prompt for Mom when she reacts first. The user prompt supplies the
# MOM_INITIAL, DAD_INITIAL and ADAM_RESPONSE fields; Mom's calibration policy
# is interpolated into the prompt text.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
mom_response_primary_sp = f"""
ROLE: Mom — the mother of a little boy named Adam. You are the PRIMARY responder.
PERSONALITY: {", ".join(mom_personality_traits)}

INPUTS YOU WILL RECEIVE IN THE PROMPT:
- MOM_INITIAL: your original opener to Adam
- DAD_INITIAL: Dad’s original opener to Adam
- ADAM_RESPONSE: Adam’s response to those openers

GOAL:
- Give a short, sentiment-led reaction to Adam’s response (e.g., joyful, proud, disappointed, concerned, calm, angry, etc.) appropriate to the situation.
- This is a response, not a question or request. You may give a brief, concrete acknowledgment or encouragement or disapproval, but do not ask a question.
- Address Adam directly. Be specific to what he said and the initial context.

RESPONSE CALIBRATION POLICY (in response to ADAM_RESPONSE):
{response_calibration_policy_mom}

STYLE:
- ≤ 100 tokens. No numbered lists, no emojis.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
"""

# Gemini model bound to the prompt above via `system_instruction`.
mom_response_primary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=mom_response_primary_sp
)

# DAD — PRIMARY RESPONDER
# System prompt for Dad when he reacts first. The user prompt supplies the
# MOM_INITIAL, DAD_INITIAL and ADAM_RESPONSE fields; Dad's calibration policy
# is interpolated into the prompt text.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
dad_response_primary_sp = f"""
ROLE: Dad — the father of a little boy named Adam. You are the PRIMARY responder.
PERSONALITY: {", ".join(dad_personality_traits)}

INPUTS YOU WILL RECEIVE IN THE PROMPT:
- MOM_INITIAL: Mom’s original opener to Adam
- DAD_INITIAL: your original opener to Adam
- ADAM_RESPONSE: Adam’s response to those openers

GOAL:
- Give a short, sentiment-led reaction to Adam’s response (e.g., joyful, proud, disappointed, concerned, calm, angry, etc.) appropriate to the situation.
- This is a response, not a question or request. You may give a brief, concrete acknowledgment or encouragement or disapproval, but do not ask a question.
- Address Adam directly. Be specific to what he said and the initial context.

RESPONSE CALIBRATION POLICY (in response to ADAM_RESPONSE):
{response_calibration_policy_dad}

STYLE:
- ≤ 100 tokens. No numbered lists, no emojis.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
"""

# Gemini model bound to the prompt above via `system_instruction`.
dad_response_primary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=dad_response_primary_sp
)

# MOM — SECONDARY RESPONDER
# System prompt for Mom when she reacts after Dad. On top of the three fields
# the primary responder gets, the user prompt also supplies Dad's reaction as
# OTHER_PARENT_RESPONSE; Mom's calibration policy is interpolated into the text.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
mom_response_secondary_sp = f"""
ROLE: Mom — the mother of a little boy named Adam. You are the SECONDARY responder, after Dad.
PERSONALITY: {", ".join(mom_personality_traits)}

INPUTS YOU WILL RECEIVE IN THE PROMPT:
- MOM_INITIAL: your original opener to Adam
- DAD_INITIAL: Dad’s original opener to Adam
- ADAM_RESPONSE: Adam’s response to those openers
- OTHER_PARENT_RESPONSE: the exact message that Dad just sent, responding to Adam

GOAL:
- Give a short, sentiment-led reaction to Adam’s message, taking into account the OTHER_PARENT_RESPONSE from Dad.
- You may agree, disagree, or neither with the Dad, but remain consistent to the situation.
- This is a response, not a question or request. Address Adam directly and be specific to what he said.

RESPONSE CALIBRATION POLICY (in response to ADAM_RESPONSE):
{response_calibration_policy_mom}

STYLE:
- ≤ 100 tokens. No numbered lists, no emojis.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
"""

# Gemini model bound to the prompt above via `system_instruction`.
mom_response_secondary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=mom_response_secondary_sp
)

# DAD — SECONDARY RESPONDER
# System prompt for Dad when he reacts after Mom. On top of the three fields
# the primary responder gets, the user prompt also supplies Mom's reaction as
# OTHER_PARENT_RESPONSE; Dad's calibration policy is interpolated into the text.
# The doubled braces under OUTPUT FORMAT are f-string escapes: the prompt text
# contains a single literal {"MESSAGE": ...} object.
dad_response_secondary_sp = f"""
ROLE: Dad — the father of a little boy named Adam. You are the SECONDARY responder, after Mom.
PERSONALITY: {", ".join(dad_personality_traits)}

INPUTS YOU WILL RECEIVE IN THE PROMPT:
- MOM_INITIAL: Mom’s original opener to Adam
- DAD_INITIAL: your original opener to Adam
- ADAM_RESPONSE: Adam’s response to those openers
- OTHER_PARENT_RESPONSE: the exact message that Mom just sent, responding to Adam

GOAL:
- Give a short, sentiment-led reaction to Adam’s message, taking into account the OTHER_PARENT_RESPONSE from Mom.
- You may agree, disagree, or neither with the Mom, but remain consistent to the situation.
- This is a response, not a question or request. Address Adam directly and be specific to what he said.

RESPONSE CALIBRATION POLICY (in response to ADAM_RESPONSE):
{response_calibration_policy_dad}

STYLE:
- ≤ 100 tokens. No numbered lists, no emojis.

OUTPUT FORMAT (must be valid JSON):
{{"MESSAGE": "<your single message here>"}}

CRITICAL:
- Output ONLY the JSON object above. No extra keys, no prose outside JSON.
"""

# Gemini model bound to the prompt above via `system_instruction`.
dad_response_secondary_model = genai.GenerativeModel(
    model_name=base_gemini_model,
    system_instruction=dad_response_secondary_sp
)



In [None]:
# function to determine conversation order

def choose_parent_order() -> dict:
    """Randomly decide which parent goes first in each phase of an episode.

    Two independent draws are made: one for the opening ("init") phase and one
    for the response ("res") phase. Whoever is drawn first gets that phase's
    *primary* Gemini model; the other parent gets the *secondary* one.

    Returns:
        A dict with the keys ``init_1``, ``init_2``, ``res_1`` and ``res_2``.
        Each value is a dict with ``who`` ("mom" or "dad"), that parent's
        fixed ``label`` string, and the ``model`` to call for the turn.
    """
    labels = {"mom": "a1b2c3", "dad": "d4e5f6"}

    def slot(who, model):
        return {"who": who, "label": labels[who], "model": model}

    # a fresh, unseeded generator is created on every call
    picker = random.Random()
    opener = picker.choice(["mom", "dad"])
    first_responder = picker.choice(["mom", "dad"])

    # opening turns
    if opener == "mom":
        init_1 = slot("mom", mom_init_primary_model)
        init_2 = slot("dad", dad_init_secondary_model)
    else:
        init_1 = slot("dad", dad_init_primary_model)
        init_2 = slot("mom", mom_init_secondary_model)

    # reaction turns
    if first_responder == "mom":
        res_1 = slot("mom", mom_response_primary_model)
        res_2 = slot("dad", dad_response_secondary_model)
    else:
        res_1 = slot("dad", dad_response_primary_model)
        res_2 = slot("mom", mom_response_secondary_model)

    return {"init_1": init_1, "init_2": init_2, "res_1": res_1, "res_2": res_2}

In [None]:
# function to calculate the reward value from the sentiment scores of the parent responses

# Emotion classifier applied to the parents' replies. It is asked to return
# the score of every label (not just the top one) and is placed on the CPU.
sentiment_analyzer = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=True,
    device_map='cpu',
)

# Signed weight per (lower-cased) emotion label; each label's score is
# multiplied by its weight when the reward is computed.
sentiment_weights = dict(
    joy=1.0,
    neutral=0.2,
    surprise=0.0,
    sadness=-0.6,
    fear=-0.7,
    anger=-0.9,
    disgust=-1.0,
)

def calculate_reward(res_1: str, res_2: str) -> float:
    """Turn the two parent reactions into one scalar reward.

    Each reaction is run through ``sentiment_analyzer``; every label score is
    multiplied by its entry in ``sentiment_weights`` (labels without an entry
    count as 0.0) and the products are summed and clamped to [-1, 1].

    Args:
        res_1: Text of the first parent's reaction.
        res_2: Text of the second parent's reaction.

    Returns:
        The mean of the two clamped sentiment sums.
    """

    def weighted_sentiment(text: str) -> float:
        # the analyzer wraps the per-label results of a single text in a list
        per_label = sentiment_analyzer(text)[0]
        scores = {item['label'].lower(): float(item['score']) for item in per_label}
        total = sum(
            score * sentiment_weights.get(label, 0.0)
            for label, score in scores.items()
        )
        # guard against tiny floating-point overshoot outside [-1, 1]
        return max(-1.0, min(1.0, total))

    first = weighted_sentiment(res_1)
    second = weighted_sentiment(res_2)
    return (first + second) / 2.0

# Quick smoke test of calculate_reward on two sample parent replies.
try_1 = "i love you son"
try_2 = 'what do you want for dinner?'
example = calculate_reward(try_1, try_2)
# Bare expression — presumably there so the notebook cell displays the value.
example

In [None]:
# define iterative situations

def build_episode_situations(situations: list, total_episodes: int, block_size: int = 100, seed = None):
    """Build a per-episode schedule from repeated, reshuffled passes over ``situations``.

    Every pass contains each situation exactly once in a fresh random order;
    passes are concatenated until ``total_episodes`` entries exist, and the
    last pass is truncated if needed.

    The number of passes is derived from ``len(situations)``. Previously it was
    ``total_episodes // block_size``, which returned too few entries whenever
    the list did not hold exactly ``block_size`` items or ``total_episodes``
    was not a multiple of ``block_size``.

    Args:
        situations: Situations to schedule. The list itself is not modified.
        total_episodes: Length of the schedule to produce.
        block_size: Kept for backward compatibility; it no longer influences
            the result.
        seed: When not None, the global ``random`` module is seeded with it
            before shuffling, so the schedule is reproducible.

    Returns:
        A list with exactly ``total_episodes`` situations (empty when
        ``total_episodes`` is not positive).

    Raises:
        ValueError: If ``situations`` is empty while ``total_episodes`` is positive.
    """
    if seed is not None:
        random.seed(seed)

    if total_episodes <= 0:
        return []
    if not situations:
        raise ValueError("situations must not be empty when total_episodes > 0")

    all_situations = []
    while len(all_situations) < total_episodes:
        shuffled = list(situations)
        random.shuffle(shuffled)
        all_situations.extend(shuffled)

    return all_situations[:total_episodes]

# Load the situation prompts; the CSV must provide a 'SITUATION' column.
parent_situations_df = pd.read_csv("parent_situations.csv")
parent_situations_list = parent_situations_df['SITUATION'].tolist()
# Schedule of situations for 10000 episodes (the training loop indexes it by episode).
parent_situations_randomized = build_episode_situations(parent_situations_list, 10000)
# Bare expression — presumably a notebook sanity check of the schedule length.
len(parent_situations_randomized)

# Training Loop

In [None]:

# Policy ("child") model and PPO trainer setup.
# This file uses TRL's legacy PPO API: PPOConfig fields such as `target_kl`,
# `init_kl_coef` and `ppo_epochs`, plus `trainer.generate()` / `trainer.step()`.
# The trainer therefore has to be built with the legacy keywords as well.
base_model = "Qwen/Qwen2.5-7B"
tokenizer = AutoTokenizer.from_pretrained(base_model)
if tokenizer.pad_token is None:
    # generation is given a pad_token_id; fall back to EOS when none is defined
    tokenizer.pad_token = tokenizer.eos_token

child_llm = AutoModelForCausalLMWithValueHead.from_pretrained(base_model, torch_dtype='auto', device_map='auto')

# The legacy PPOTrainer only accepts a value-head wrapper (or None) as the
# reference model, so the frozen reference copy is loaded through the same
# wrapper class as the policy instead of a bare AutoModelForCausalLM.
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model, torch_dtype='auto', device_map='auto')
ref_model.eval()
for param in ref_model.parameters():
    param.requires_grad_(False)

ppo_config = PPOConfig(
    learning_rate=1e-5,
    mini_batch_size=1,
    batch_size=1,  # one (query, response, reward) sample per PPO step
    target_kl=0.03,
    init_kl_coef=0.02,
    ppo_epochs=4,
    gradient_accumulation_steps=1,
    gamma=1.0, lam=1.0,
)

# Legacy signature: PPOTrainer(config, model, ref_model, tokenizer, ...).
# `args=` / `processing_class=` belong to the newer trainer, which has neither
# generate() nor step().
trainer = PPOTrainer(config=ppo_config, model=child_llm, ref_model=ref_model, tokenizer=tokenizer)

def parse_json_text(s: str) -> dict:
    """Parse a model reply into a dict that always contains a ``"MESSAGE"`` key.

    The reply is stripped, Markdown code-fence lines (those starting with
    three backticks) are dropped, and one pair of wrapping quotes is removed
    before JSON decoding.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded JSON object when it is a dict with a ``"MESSAGE"`` key.
        Otherwise ``{"MESSAGE": <cleaned text>}``. This covers invalid JSON,
        valid JSON that is not an object (number, list, string, null), and
        objects without ``"MESSAGE"``, so callers can always index
        ``["MESSAGE"]``.
    """
    s = s.strip()
    if s.startswith("```"):
        s = "\n".join(ln for ln in s.splitlines() if not ln.strip().startswith("```")).strip()
    if (s.startswith("'") and s.endswith("'")) or (s.startswith('"') and s.endswith('"')):
        s = s[1:-1]
    try:
        parsed = json.loads(s)
    except JSONDecodeError:
        return {"MESSAGE": s}
    # json.loads happily returns ints, lists, strings...; only a dict carrying
    # the expected key is usable downstream
    if isinstance(parsed, dict) and "MESSAGE" in parsed:
        return parsed
    return {"MESSAGE": s}

# since we are calling google gemini 4 times per episode, totalling 40k calls, we should have retry capabilities
def call_gemini(model: genai.GenerativeModel, input_data: str, retries = 3, backoff = 1.5):
    """Call ``model.generate_content`` with retries and exponential backoff.

    Args:
        model: Gemini model to query.
        input_data: Prompt text passed as ``contents``.
        retries: Total number of attempts; must be at least 1.
        backoff: Base of the wait time; attempt ``i`` (0-based) is followed by
            a sleep of ``backoff ** i`` seconds before the next try.

    Returns:
        Whatever ``model.generate_content`` returns on the first successful attempt.

    Raises:
        ValueError: If ``retries`` is smaller than 1. Previously the loop was
            skipped in that case and ``None`` was returned silently.
        Exception: The error of the final attempt is re-raised unchanged.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(retries):
        try:
            # each request is limited to 30 seconds
            return model.generate_content(contents = input_data, request_options = {"timeout": 30})
        except Exception:
            # out of attempts: surface the original error to the caller
            if attempt == retries - 1:
                raise
            time.sleep(backoff ** attempt)

# PPO training loop (legacy TRL API). Every episode is one on-policy sample:
#   parents open -> child (policy) replies -> parents react -> the sentiment of
#   the two reactions becomes the scalar reward for the child's reply.
# Legacy `trainer.generate` takes a 1-D query tensor, and legacy `trainer.step`
# takes three lists of length `batch_size` (1 here): 1-D query tensors, 1-D
# response-only tensors, and scalar reward tensors.

# The value-head wrapper is a plain nn.Module without a `.device` attribute;
# the wrapped Hugging Face model exposes the device the inputs must live on.
policy_device = child_llm.pretrained_model.device

for episode, episode_situation in enumerate(parent_situations_randomized, start=1):
    # this determines the order in which the parents (mom and dad) send a message to the child
    # example: mom initializes, then dad initializes => child responds => dad sentiment response, then mom sentiment response
    parent_order = choose_parent_order()

    # first parent's initial message
    init_1_model = parent_order['init_1']['model']
    init_1_input = f"SITUATION: {json.dumps(episode_situation)}"
    init_1_message_raw = call_gemini(model = init_1_model, input_data = init_1_input)
    init_1_message_json = parse_json_text(init_1_message_raw.text)

    # second parent's initial message
    init_2_model = parent_order['init_2']['model']
    init_2_input = f"OTHER_PARENT_MESSAGE: {json.dumps(init_1_message_json['MESSAGE'])}"
    init_2_message_raw = call_gemini(model = init_2_model, input_data = init_2_input)
    init_2_message_json = parse_json_text(init_2_message_raw.text)

    # concatenate initial messages into input state
    full_input_state = f"""
    '{parent_order['init_1']['label']}': '{init_1_message_json['MESSAGE']}'
    -----------------------------------------------------------------------
    '{parent_order['init_2']['label']}': '{init_2_message_json['MESSAGE']}'
    -----------------------------------------------------------------------
    'You (Adam)': 
    """
    if episode % 10 == 0:
        print(f"\n=== Episode {episode} ===\n{full_input_state}")

    # tokenize parent input; the tokenizer returns [1, Tq], the trainer wants the 1-D row [Tq]
    q = tokenizer(full_input_state, return_tensors="pt", padding=True).to(policy_device)
    query_tensor = q['input_ids'][0]  # state s in PPO terms, shape [Tq]

    # sample the child response; the output holds [prompt tokens || generated tokens]
    gen_ids = trainer.generate(
        query_tensor,
        max_new_tokens=120,
        do_sample=True,
        temperature=0.9,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # flatten to 1-D (a single sample) and cut the prompt off: PPO must only
    # see the tokens the policy actually generated as the "response"
    full_sequence = gen_ids.reshape(-1)                       # shape [Tq + Tr]
    response_tensor = full_sequence[query_tensor.shape[0]:]   # shape [Tr]
    child_text = tokenizer.decode(response_tensor, skip_special_tokens=True)

    # generate parent responses to child response
    mom_initial_message = init_1_message_json['MESSAGE'] if (parent_order['init_1']['who'] == 'mom') else init_2_message_json['MESSAGE']
    dad_initial_message = init_1_message_json['MESSAGE'] if (parent_order['init_1']['who'] == 'dad') else init_2_message_json['MESSAGE']

    # first parent's response
    res_1_model = parent_order['res_1']['model']
    res_1_input = (
        f"MOM_INITIAL: {json.dumps(mom_initial_message)}\n"
        f"DAD_INITIAL: {json.dumps(dad_initial_message)}\n"
        f"ADAM_RESPONSE: {json.dumps(child_text)}"
    )
    res_1_message_raw = call_gemini(model = res_1_model, input_data = res_1_input)
    res_1_message_json = parse_json_text(res_1_message_raw.text)

    # second parent's response (also sees the first parent's reaction)
    res_2_model = parent_order['res_2']['model']
    res_2_input = (
        f"MOM_INITIAL: {json.dumps(mom_initial_message)}\n"
        f"DAD_INITIAL: {json.dumps(dad_initial_message)}\n"
        f"ADAM_RESPONSE: {json.dumps(child_text)}\n"
        f"OTHER_PARENT_RESPONSE: {json.dumps(res_1_message_json['MESSAGE'])}"
    )
    res_2_message_raw = call_gemini(model = res_2_model, input_data = res_2_input)
    res_2_message_json = parse_json_text(res_2_message_raw.text)

    # calculate reward, based on parent's responses; within [-1, 1]
    reward_scalar = calculate_reward(res_1=res_1_message_json['MESSAGE'], res_2=res_2_message_json['MESSAGE'])

    # the trainer expects ONE scalar score per sample and distributes it over
    # the response tokens itself, so no per-token reward vector is built here
    reward_tensors = [torch.tensor(float(reward_scalar))]

    # shape check on the first episode
    if episode == 1:
        print("query:", query_tensor.shape, "response:", response_tensor.shape)  # expect [Tq], [Tr]
        print("reward_tensors[0].shape:", reward_tensors[0].shape)               # expect [] (scalar)

    # update network weights and collect training statistics
    stats = trainer.step([query_tensor], [response_tensor], reward_tensors)

    # log training progress; the legacy trainer namespaces its stats keys
    if episode % 10 == 0:
        nan = float('nan')
        kl_value = float(stats.get('objective/kl', nan))
        vf_loss_value = float(stats.get('ppo/loss/value', nan))
        entropy_value = float(stats.get('objective/entropy', nan))
        print(
            f"[episode {episode}] reward={reward_scalar:+.3f} "
            f"| KL={kl_value:.4f} "
            f"vf_loss={vf_loss_value:.4f} "
            f"entropy={entropy_value:.4f}"
        )

    # save checkpoints
    if episode % 200 == 0:
        outdir = f"./checkpoints/ppo_ep_{episode}"
        trainer.save_pretrained(outdir)