# Lesson 6: Use your voice

**Lesson objective**: Get voice feedback 

So far we've set up a moderately complex workflow with a human feedback loop. Let's run it through the visualizer to see what it looks like.

<div style="background-color:#fff1d7; padding:15px;"> <b> Note</b>: Make sure to run the notebook cell by cell. Please try to avoid running all cells at once.</div>

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os, json
from llama_parse import LlamaParse
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context,
    InputRequiredEvent,
    HumanResponseEvent
)
from llama_index.utils.workflow import draw_all_possible_flows
from llama_index.readers.whisper import WhisperReader
import gradio as gr
import asyncio
from queue import Queue

import nest_asyncio
nest_asyncio.apply()

from helper import get_openai_api_key, get_llama_cloud_api_key

llama_cloud_api_key = get_llama_cloud_api_key()
openai_api_key = get_openai_api_key()

In [3]:
class ParseFormEvent(Event):
    application_form: str

class QueryEvent(Event):
    query: str

class ResponseEvent(Event):
    response: str

class FeedbackEvent(Event):
    feedback: str

class GenerateQuestionsEvent(Event):
    pass

class RAGWorkflow(Workflow):
    storage_dir = "./storage"
    llm: OpenAI
    query_engine: VectorStoreIndex

    @step
    async def set_up(self, ctx: Context, ev: StartEvent) -> ParseFormEvent:

        if not ev.resume_file:
            raise ValueError("No resume file provided")

        if not ev.application_form:
            raise ValueError("No application form provided")

        # give ourselves an LLM to work with
        self.llm = OpenAI(model="gpt-4o-mini")

        # ingest our data and set up the query engine
        if os.path.exists(self.storage_dir):
            # we've already ingested our documents
            storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
            index = load_index_from_storage(storage_context)
        else:
            # we need to parse and load our documents
            documents = LlamaParse(
                api_key=llama_cloud_api_key,
                base_url=os.getenv("LLAMA_CLOUD_BASE_URL"),
                result_type="markdown",
                content_guideline_instruction="This is a resume, gather related facts together and format it as bullet points with headers"
            ).load_data(ev.resume_file)
            # embed and index the documents
            index = VectorStoreIndex.from_documents(
                documents,
                embed_model=OpenAIEmbedding(model_name="text-embedding-3-small")
            )
            index.storage_context.persist(persist_dir=self.storage_dir)

        # either way, create a query engine
        self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)

        # let's pass our application form to a new step where we parse it
        return ParseFormEvent(application_form=ev.application_form)

    # we've separated the form parsing from the question generation
    @step
    async def parse_form(self, ctx: Context, ev: ParseFormEvent) -> GenerateQuestionsEvent:
        parser = LlamaParse(
            api_key=llama_cloud_api_key,
            base_url=os.getenv("LLAMA_CLOUD_BASE_URL"),
            result_type="markdown",
            content_guideline_instruction="This is a job application form. Create a list of all the fields that need to be filled in.",
            formatting_instruction="Return a bulleted list of the fields ONLY."
        )

        # get the LLM to convert the parsed form into JSON
        result = parser.load_data(ev.application_form)[0]
        raw_json = self.llm.complete(
            f"This is a parsed form. Convert it into a JSON object containing only the list of fields to be filled in, in the form {{ fields: [...] }}. <form>{result.text}</form>. Return JSON ONLY, no markdown.")
        fields = json.loads(raw_json.text)["fields"]

        await ctx.set("fields_to_fill", fields)

        return GenerateQuestionsEvent()

    # this step can get triggered either by GenerateQuestionsEvent or a FeedbackEvent
    @step
    async def generate_questions(self, ctx: Context, ev: GenerateQuestionsEvent | FeedbackEvent) -> QueryEvent:

        # get the list of fields to fill in
        fields = await ctx.get("fields_to_fill")

        # generate one query for each of the fields, and fire them off
        for field in fields:
            question = f"How would you answer this question about the candidate? <field>{field}</field>"

            if hasattr(ev,"feedback"):
                question += f"""
                    \nWe previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback>{ev.feedback}</feedback>
                """

            ctx.send_event(QueryEvent(
                field=field,
                query=question
            ))

        # store the number of fields so we know how many to wait for later
        await ctx.set("total_fields", len(fields))
        return

    @step
    async def ask_question(self, ctx: Context, ev: QueryEvent) -> ResponseEvent:
        print(f"Asking question: {ev.query}")

        response = self.query_engine.query(f"This is a question about the specific resume we have in our database: {ev.query}")

        print(f"Answer was: {str(response)}")

        return ResponseEvent(field=ev.field, response=response.response)

    # we now emit an InputRequiredEvent
    @step
    async def fill_in_application(self, ctx: Context, ev: ResponseEvent) -> InputRequiredEvent:
        # get the total number of fields to wait for
        total_fields = await ctx.get("total_fields")

        responses = ctx.collect_events(ev, [ResponseEvent] * total_fields)
        if responses is None:
            return None # do nothing if there's nothing to do yet

        # we've got all the responses!
        responseList = "\n".join("Field: " + r.field + "\n" + "Response: " + r.response for r in responses)

        result = self.llm.complete(f"""
            You are given a list of fields in an application form and responses to
            questions about those fields from a resume. Combine the two into a list of
            fields and succinct, factual answers to fill in those fields.

            <responses>
            {responseList}
            </responses>
        """)

        # save the result for later
        await ctx.set("filled_form", str(result))

        # Let's get a human in the loop
        return InputRequiredEvent(
            prefix="How does this look? Give me any feedback you have on any of the answers.",
            result=result
        )

    # Accept the feedback.
    @step
    async def get_feedback(self, ctx: Context, ev: HumanResponseEvent) -> FeedbackEvent | StopEvent:

        result = self.llm.complete(f"""
            You have received some human feedback on the form-filling task you've done.
            Does everything look good, or is there more work to be done?
            <feedback>
            {ev.response}
            </feedback>
            If everything is fine, respond with just the word 'OKAY'.
            If there's any other feedback, respond with just the word 'FEEDBACK'.
        """)

        verdict = result.text.strip()

        print(f"LLM says the verdict was {verdict}")
        if (verdict == "OKAY"):
            return StopEvent(result=await ctx.get("filled_form"))
        else:
            return FeedbackEvent(feedback=ev.response)


In [4]:
WORKFLOW_FILE = "workflows/lesson_6.html"
draw_all_possible_flows(RAGWorkflow, filename=WORKFLOW_FILE)

<class 'NoneType'>
<class '__main__.ResponseEvent'>
<class 'llama_index.core.workflow.events.InputRequiredEvent'>
<class '__main__.QueryEvent'>
<class '__main__.FeedbackEvent'>
<class 'llama_index.core.workflow.events.StopEvent'>
<class '__main__.GenerateQuestionsEvent'>
<class '__main__.ParseFormEvent'>
workflows/lesson_6.html


In [5]:
from IPython.display import display, HTML, DisplayHandle
from helper import extract_html_content

html_content = extract_html_content(WORKFLOW_FILE)
display(HTML(html_content), metadata=dict(isolated=True))

Cool! You can see the path all the way to the end and the feedback loop is clear.

<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>To access <code>fake_application_form.pdf</code>, <code>fake_resume.pdf</code>, <code>requirements.txt</code> and <code>helper.py</code> files:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>. The form and resume are inside the data folder.

<p> ⬇ &nbsp; <b>Download Notebooks:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Download as"</em> and select <em>"Notebook (.ipynb)"</em>.</p>

<p> 📒 &nbsp; For more help, please see the <em>"Appendix – Tips and Help"</em> Lesson.</p>

</div>

<p style="background-color:#f7fff8; padding:15px; border-width:3px; border-color:#e0f0e0; border-style:solid; border-radius:6px"> 🚨
&nbsp; <b>Different Run Results:</b> The output generated by AI chat models can vary with each execution due to their dynamic, probabilistic nature. Don't be surprised if your results differ from those shown in the video.</p>

## Getting voice feedback

Now, just for fun, you'll do one more thing: change the feedback from text feedback to actual words spoken out loud. To do this we'll use a different model from OpenAI called Whisper. LlamaIndex has a built-in way to transcribe audio files into text using Whisper.

Here's a function that takes a file and uses Whisper to return just the text:

In [6]:
def transcribe_speech(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
    audio_file= open(filepath, "rb")
    reader = WhisperReader(
        model="whisper-1",
        api_key=openai_api_key,
    )
    documents = reader.load_data(filepath)
    return documents[0].text

But before we can use it, you need to capture some audio from your microphone. That involves some extra steps!

First, create a callback function that saves data to a global variable.

In [7]:
def store_transcription(output):
    global transcription_value
    transcription_value = output
    return output

Now use Gradio, which has special widgets that can render inside a notebook, to create an interface for capturing audio from a microphone. When the audio is captured, it calls `transcribe_speech` on the recorded data, and calls `store_transcription` on that.

In [8]:
mic_transcribe = gr.Interface(
    fn=lambda x: store_transcription(transcribe_speech(x)),
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription"))

In Gradio, you further define a visual interface containing this microphone input and output, and then launch it:

<div style="background-color:#fff1d7; padding:15px;"> <b> Note</b>: Make sure to wait for the gradio interface to load. A popup window will appear and ask you to allow the use of your microphone. To record audio, make sure to click on record -> stop -> submit. Make sure the audio is captured before clicking on 'submit'.</div>

In [9]:
test_interface = gr.Blocks()
with test_interface:
    gr.TabbedInterface(
        [mic_transcribe],
        ["Transcribe Microphone"]
    )

test_interface.launch(
    share=False, 
    server_port=8000, 
    prevent_thread_lock=True
)

* Running on local URL:  https://0.0.0.0:8000

To create a public link, set `share=True` in `launch()`.




You can now print out the transcription, which is stored in that global variable you created earlier:

In [10]:
print(transcription_value)

NameError: name 'transcription_value' is not defined

You're going to want to run Gradio again, so it's a good idea to shut down the Gradio interface you were using. 

In [11]:
test_interface.close()

Closing server running on port: 8000


<div style="background-color:#fff1d7; padding:15px;"> <b> Note</b>: Make sure to run the previous cell to close the Gradio interface before running the next cell.</div>                                                                                          

Now you're going to create an entirely new class, a Transcription Handler. 

In [12]:
# New! Transcription handler.
class TranscriptionHandler:

    # we create a queue to hold transcription values
    def __init__(self):
        self.transcription_queue = Queue()
        self.interface = None

    # every time we record something we put it in the queue
    def store_transcription(self, output):
        self.transcription_queue.put(output)
        return output

    # This is the same interface and transcription logic as before
    # except it stores the result in a queue instead of a global
    def create_interface(self):
        mic_transcribe = gr.Interface(
            fn=lambda x: self.store_transcription(transcribe_speech(x)),
            inputs=gr.Audio(sources="microphone", type="filepath"),
            outputs=gr.Textbox(label="Transcription")
        )
        self.interface = gr.Blocks()
        with self.interface:
            gr.TabbedInterface(
                [mic_transcribe],
                ["Transcribe Microphone"]
            )
        return self.interface

    # we launch the transcription interface
    async def get_transcription(self):
        self.interface = self.create_interface()
        self.interface.launch(
            share=False,
            server_port=8000, 
            prevent_thread_lock=True
        )

        # we poll every 1.5 seconds waiting for something to end up in the queue
        while True:
            if not self.transcription_queue.empty():
                result = self.transcription_queue.get()
                if self.interface is not None:
                    self.interface.close()
                return result
            await asyncio.sleep(1.5)


Now you have a transcription handler, you can use it instead of the keyboard input interface when you're getting human input when you run your workflow:

In [None]:
w = RAGWorkflow(timeout=600, verbose=False)

handler = w.run(
    resume_file="./data/fake_resume.pdf",
    application_form="./data/fake_application_form.pdf"
)

async for event in handler.stream_events():
  if isinstance(event, InputRequiredEvent):
      # Get transcription
      transcription_handler = TranscriptionHandler()
      response = await transcription_handler.get_transcription()

      handler.ctx.send_event(
          HumanResponseEvent(
              response=response
          )
      )

response = await handler
print("Agent complete! Here's your final result:")
print(str(response))

Started parsing the file under job_id d087eecf-8e6b-4f3a-917b-b402b99960bf
Asking question: How would you answer this question about the candidate? <field>First Name</field>
Answer was: Sarah
Asking question: How would you answer this question about the candidate? <field>Last Name</field>
Answer was: Chen
Asking question: How would you answer this question about the candidate? <field>Email</field>
Answer was: sarah.chen@email.com
Asking question: How would you answer this question about the candidate? <field>Phone</field>
Answer was: The candidate's phone number is not provided in the available information.
Asking question: How would you answer this question about the candidate? <field>Linkedin</field>
Answer was: The candidate's LinkedIn profile can be found at linkedin.com/in/sarahchen.
Asking question: How would you answer this question about the candidate? <field>Project Portfolio</field>
Answer was: The candidate has a project portfolio that includes notable projects such as EcoTr

## Congratulations!

You've successfully created an AI agent that responds to spoken feedback.

## Resources

To learn more about agentic document workflows, you check this [article](https://www.llamaindex.ai/blog/introducing-agentic-document-workflows) and theses [example implementations](https://github.com/run-llama/llamacloud-demo/tree/main/examples/document_workflows).