In [1]:
%pip install google-genai pillow  # run once in the notebook

Note: you may need to restart the kernel to use updated packages.


In [None]:
from google import genai
from google.genai import types
import os

API_KEY = os.environ.get("GEMINI_API_KEY", "<YOUR_KEY_HERE>")
client = genai.Client(api_key=API_KEY)

story_chat = client.chats.create(
    model="gemini-2.5-flash",
    config=types.GenerateContentConfig(
        system_instruction=(
            "You are Furhat, a friendly speaking robot and interactive storyteller. "
            "You always speak directly to the user as 'you'. "
            "Your answers must be medium short, 4-6 sentences, suitable for being spoken aloud."
        )
    )
)

def generate_scene_text_llm(story, scene, emotion: str) -> str:
    """
    Use LLM to generate narration for ONE scene.
    Reads scene['llmHints'] and injects them into the prompt.
    """
    hints = scene.get("llmHints", {})
    general_hint = hints.get("general", "")
    emotion_hint = hints.get(emotion, "")

    prompt = f"""
Story id: {story['id']}
Story name: {story['name']}
Genre: {story['genre']}


Current scene id: {scene['id']}
Scene type: {scene.get('type', 'normal')}
Scene description: {scene['description']}


Detected emotion label: {emotion}

General scene hint: {general_hint}
Emotion-specific hint: {emotion_hint}

Write the narration for THIS scene only.

Requirements:
- 4 to 6 sentences.
- Match the emotional tone to the detected emotion = {emotion}.
- Do NOT mention the option texts or describe future choices explicitly.
- Do NOT jump to future scenes.
- Talk directly to the user as "you".
"""
    resp = story_chat.send_message(prompt)
    return resp.text.strip()


In [3]:
from pathlib import Path
import json
from furhat_remote_api import FurhatRemoteAPI
furhat = FurhatRemoteAPI("localhost")

In [4]:
furhat.set_face(character='Titan', mask="Adult")
furhat.set_voice(name='Joanna')

{'message': 'Successfully changed Furhat voice', 'success': True}

In [5]:
DATA_PATH = Path("../data/processed/LLM")

In [6]:
def load_all_stories():
    stories = {}
    for path in DATA_PATH.glob("*.json"):
        story_id = path.stem
        with open(path, "r") as f:
            stories[story_id] = json.load(f)
    return stories

STORIES = load_all_stories()
list(STORIES.keys())

['LLM_as', 'LLM_lk', 'LLM_sa']

In [7]:
def get_all_emotions(stories):
    emos = set()
    for story in stories.values():
        for scene in story["scenes"]:
            emos.update(scene.get("templates", {}).keys())
    return sorted(emos)

EMOTIONS = get_all_emotions(STORIES)
EMOTIONS

[]

In [8]:
def get_scene(story, scene_id):
    for scene in story["scenes"]:
        if scene["id"] == scene_id:
            return scene
    return None

def choose_template(scene, emotion):
    """Pick template for emotion, fall back to neutral or description."""
    return scene["templates"].get(
        emotion,
        scene["templates"].get("neutral", scene["description"])
    )

def next_scene(scene, chosen_option_id):
    for opt in scene.get("options", []):
        if opt["id"] == chosen_option_id:
            return opt["nextScene"]
    return None

In [None]:
# suggestion: remove all "mood" references and just use emotions everywhere, including in the JSON files
KNOWN_MOODS = [
    "tired", "comfort-seeking", "sad",
    "excited", "happy", "energized",
    "neutral", "curious"
] # fix to match emotions?

def detect_mood_from_text(text: str) -> str:
    t = text.lower()
    if "comfort" in t:
        return "comfort-seeking"
    for m in KNOWN_MOODS:
        if m in t:
            return m
    # fallback
    return "neutral"


def detect_emotion_from_text(text: str) -> str:
    """
    Placeholder until the webcam model is integrated.
    Maps some common words to your emotion labels.
    """
    t = text.lower()
    if any(w in t for w in ["happy", "great", "good", "nice"]):
        return "happy"
    if any(w in t for w in ["sad", "down", "bad"]):
        return "sad"
    if any(w in t for w in ["angry", "mad", "annoyed"]):
        return "angry"
    if any(w in t for w in ["scared", "afraid", "fear", "nervous"]):
        return "fear"
    if any(w in t for w in ["disgust", "gross"]):
        return "disgust"
    if any(w in t for w in ["surprised", "wow"]):
        return "surprised"
    return "neutral"

In [10]:
def listen_text(language: str = "en-US") -> str:
    """Listen once and return lowercase text, or '' if nothing."""
    response = furhat.listen(language=language)
    if response and getattr(response, "message", None):
        return response.message.strip()
    return ""

In [11]:
def choose_option_from_speech(scene, text: str):
    """
    Try to map user's utterance to an option:
    - number words (one, two, three...)
    - digits (1, 2, 3)
    - keywords from the option text
    Returns option object or None.
    """
    t = text.lower()

    options = scene.get("options", [])
    if not options:
        return None

    # digit index
    for i, opt in enumerate(options, start=1):
        if str(i) in t:
            return opt

    # word -> index
    words_to_num = {
        "one": 1, "first": 1,
        "two": 2, "second": 2,
        "three": 3, "third": 3
    }
    for w, num in words_to_num.items():
        if w in t and 1 <= num <= len(options):
            return options[num - 1]

    # keyword match: if option text words appear
    for opt in options:
        key = opt["text"].split()[0].lower()  # very simple
        if key in t:
            return opt

    # fallback: first option
    return options[0]

In [12]:
def extract_name(text):
     # Extract name from response (simple rule-based)
    text = text.strip()
    # Remove common phrases
    words_to_remove = ["my", "name", "is", "i'm", "im", "i", "am", "call", "me"]
    words = text.lower().split()
    name_words = [w for w in words if w not in words_to_remove]
        
    if name_words:
        return name_words[0].capitalize()
    return "Customer"

In [13]:
def ask_for_new_story(customer_name: str, current_story: str, stories=STORIES):
    # List all story IDs except the one currently being told
    available = [sid for sid in stories.keys() if sid != current_story]

    # Create a natural-language list (e.g., "dummy1, dummy3 or dummy5")
    if len(available) == 1:
        options_text = available[0]
    else:
        options_text = ", ".join(available[:-1]) + f" or {available[-1]}"

    # Ask user
    furhat.say(
        text=f"I understand {customer_name}. Let's change to another story. "
             f"Would you like {options_text}?",
        blocking=True
    )

    # Listen to their answer
    answer = listen_text().lower()

    # Rule-based matching against allowed stories only
    for sid in available:
        if sid.lower() in answer:
            furhat.say(text=f"Alright, switching to {sid}.", blocking=True)
            return sid

    # If unclear — do not switch
    furhat.say(text="I didn’t understand that. Let's continue with our current story.", blocking=True)
    return current_story


In [None]:
def run_story_session():
    # --- Greeting & mood ---
    furhat.say(text="Hello! I am your interactive storyteller.", blocking=True)
    furhat.say(text="What is your name?", blocking = True)

# Listen for name
    response = furhat.listen()
    if response and response.message:
        customer_name = extract_name(response.message)
        furhat.say(text=f"Nice to meet you, {customer_name}!", blocking = True)
    else:
        customer_name = "Friend"
        furhat.say(text="Nice to meet you!", blocking = True)
    
    # This should be done by the webcam
    furhat.say(text="If I looked at your face, would you say you feel happy, sad, angry, afraid, disgusted, surprised, or neutral?", blocking=True)
    emotionresponse = furhat.listen()

    if emotionresponse and emotionresponse.message:
        user_emotion = detect_emotion_from_text(emotionresponse.message)
        furhat.say(text=f"You answered, {user_emotion}", blocking = True)
        if emotionresponse not in EMOTIONS:
            emotion = "neutral"
        else:
            emotion=user_emotion
        

    #This should be done by webcam
    furhat.say(text="How are you feeling right now?", blocking=True)
    mood_text = listen_text()
    mood = detect_mood_from_text(mood_text)

    # Ask which story the user would like
    furhat.say(text=f"{customer_name}, let's start a new story. Would you like LLM1, LLM2 or LLM3?", blocking=True)
    story_choice = listen_text()
    # Rule-based problem detection
    if any(word in story_choice for word in ["LLM1", "first", "one"]):
        chosen_story = "LLM_as"
        furhat.say(text=f"Great choice, {customer_name}! Let's go for the LLM1 story!", blocking = True)
    elif any(word in story_choice for word in ["LLM1", "second", "two"]):
        chosen_story = "LLM_lk"
        furhat.say(text=f"Great choice, {customer_name}! Let's go for the LLM2 story!", blocking = True)
    elif any(word in story_choice for word in ["LLM3", "third", "tree"]):
        chosen_story = "LLM_sa"
        furhat.say(text=f"Great choice, {customer_name}! Let's go for the LLM3 story!", blocking = True)
    else:
        furhat.say(text="I didn't catch that! Let's go for LLM1.", blocking = True)
        chosen_story = "LLM_as"
        # chosen_story = select_story(mood, emotion)
        # furhat.say(text=f"Let's go for the {chosen_story} story!", blocking=True)

    

    story = STORIES[chosen_story]
    
    furhat.say(text=f"'{story['name']}' is a story about {story['shortDescription']}.", blocking=True)


    furhat.say(text=f"Would you like to start the {story['name']}?", blocking = True)
    start_response = listen_text()
    if any(word in start_response for word in ["yes", "start"]):
        scene_id = story["scenes"][0]["id"]
        
    elif any(word in start_response for word in ["no", "change"]):
        new_story = ask_for_new_story(customer_name, chosen_story)



    # intro = first scene in JSON
    scene_id = story["scenes"][0]["id"]


    while True:
        scene = get_scene(story, scene_id)

        try:
            text_to_say = generate_scene_text_llm(story, scene, emotion)
        except Exception as e:
            print("LLM error:", e)
            # fallback to old rule-based template
            text_to_say = scene.get("description", "I cannot think of anything to say right now.")

        furhat.say(text=text_to_say, blocking=True)

        # no options -> end
        if not scene.get("options"):
            furhat.say(text="That was the end of this story.", blocking=True)
            break

        # read options
        furhat.say(text="What would you like to do next?", blocking=True)
        for idx, opt in enumerate(scene["options"], start=1):
            furhat.say(text=f"Option {idx}: {opt['text']}", blocking=True)

        # listen for choice / meta-intents
        answer = listen_text()

        # global intents
        if "quit" in answer or "stop" in answer:
            furhat.say(text="Okay, I will stop the story here.", blocking=True)
            break

        if "change" in answer or "another" in answer:
            new_story = ask_for_new_story(customer_name, chosen_story)
            story = STORIES[new_story]
            continue

        if "repeat" in answer:
            # just repeat current scene
            furhat.say(text="Let me repeat that part.", blocking=True)
            continue

        # choose option
        opt = choose_option_from_speech(scene, answer)
        scene_id = opt["nextScene"]
        
        # This should be done by webcam
        furhat.say(text="If I looked at your face, would you say you feel happy, sad, angry, afraid, disgusted, surprised, or neutral?", blocking=True)
        emotionresponse = furhat.listen()

        if emotionresponse and emotionresponse.message:
            user_emotion = detect_emotion_from_text(emotionresponse.message)
            furhat.say(text=f"You answered, {user_emotion}", blocking = True)
            if emotionresponse not in EMOTIONS:
                emotion = "neutral"
            else:
                emotion=user_emotion

In [15]:
run_story_session()