In [1]:
from copy import deepcopy
from pathlib import Path
import os
import re
from copy import deepcopy
from pathlib import Path
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_parse import LlamaParse
from llama_index.core.schema import TextNode
from typing import Optional


In [2]:
import nest_asyncio

nest_asyncio.apply()

## Setting Up LlamaIndex and OpenAI Models

In this section, we initialize the `Settings` for LlamaIndex with the OpenAI embedding and language models. We use the `OpenAIEmbedding` model for embeddings and the `OpenAI` model for language processing.

In [8]:
OPENAI_API_KEY = os.getenv('PERSONAL_OAI_API_KEY') 
LLAMA_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

In [9]:
embed_model = OpenAIEmbedding(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
llm = OpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

Settings.embed_model = embed_model
Settings.llm = llm

In [10]:
parser = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt-4o-mini",
    vendor_multimodal_api_key=OPENAI_API_KEY,
    verbose=True,
    fast_mode=True,
    num_workers=8,
)


In [11]:
if os.path.exists("input\WSQ - Project Management with Generative AI (GenAI) - v3.pdf"):
    print("File exists")

File exists


In [12]:
print(f"Parsing slide deck...")
md_json_objs = parser.get_json_result("input\WSQ - Project Management with Generative AI (GenAI) - v3.pdf")
md_json_list = md_json_objs[0]["pages"]

Parsing slide deck...
Started parsing the file under job_id ded5715c-130d-4631-b814-f6bfbb644d48


In [13]:
print(md_json_list[1]["text"])

                        Digital Attendance
●
   It is mandatory for you to take both AM and
   PM digital attendance especially for WSQ
   courses.                                                           Scan with Singpass appto log in
●  The trainer or administrator will show you
   the digital attendance QR code generated
   from TPG.
●  Please scan the QR code from your                                      Singpass
   Singpass App and submit your
   attendance.
                                                                                              2
                     This material belongs to Tertiary Infotech Pte Ltd (UEN: 20120096W). All Rights Reserved


In [14]:
print(md_json_list[1].keys())

dict_keys(['page', 'text', 'images', 'charts', 'items', 'status', 'links', 'width', 'height', 'triggeredAutoMode', 'structuredData', 'noStructuredContent', 'noTextContent'])


In [15]:
image_dicts = parser.get_images(md_json_objs, download_path="data_images")

> Image for page 1: [{'name': 'img_p0_3.png', 'height': 137, 'width': 300, 'x': 15.348425687999999, 'y': 219.125947956912, 'original_width': 300, 'original_height': 137, 'ocr': [{'x': 106, 'y': 34, 'w': 116, 'h': 24, 'confidence': '0.9842973394927044', 'text': 'SINGAPORE'}, {'x': 21, 'y': 47, 'w': 64, 'h': 40, 'confidence': '0.9812295739401005', 'text': 'WS'}, {'x': 106, 'y': 54, 'w': 194, 'h': 26, 'confidence': '0.4937106920834399', 'text': 'WorkforCE SKILL8'}, {'x': 106, 'y': 75, 'w': 159, 'h': 24, 'confidence': '0.6521510474818355', 'text': 'QUALIFICATIONS'}]}, {'name': 'img_p0_4.png', 'height': 80, 'width': 350, 'x': 82.51181366399999, 'y': 311.43105327288, 'original_width': 350, 'original_height': 80, 'ocr': [{'x': 2, 'y': 6, 'w': 348, 'h': 74, 'confidence': '0.37336243117196827', 'text': 'Umfmtech'}]}, {'name': 'img_p0_5.png', 'height': 400, 'width': 400, 'x': 12.000000384, 'y': 306.91363186847997, 'original_width': 400, 'original_height': 400, 'ocr': [{'x': 86, 'y': 40, 'w': 246

In [16]:
from llama_index.core.schema import TextNode
from typing import Optional

In [17]:
# get pages loaded through llamaparse
import re


def get_page_number(file_name):
    match = re.search(r"-page-(\d+)\.jpg$", str(file_name))
    if match:
        return int(match.group(1))
    return 0


def _get_sorted_image_files(image_dir):
    """Get image files sorted by page."""
    raw_files = [f for f in list(Path(image_dir).iterdir()) if f.is_file()]
    sorted_files = sorted(raw_files, key=get_page_number)
    return sorted_files

In [18]:
from copy import deepcopy
from pathlib import Path


# attach image metadata to the text nodes
def get_text_nodes(json_dicts, image_dir=None):
    """Split docs into nodes, by separator."""
    nodes = []

    image_files = _get_sorted_image_files(image_dir) if image_dir is not None else None
    md_texts = [d["text"] for d in json_dicts]

    for idx, md_text in enumerate(md_texts):
        chunk_metadata = {"page_num": idx + 1}
        # Check if an image exists for the current index
        if idx < len(image_files):
            chunk_metadata["image_path"] = str(image_files[idx])
        else:
            chunk_metadata["image_path"] = None  # No image available for this page
        
        # if image_files is not None:
        #     image_file = image_files[idx]
        #     chunk_metadata["image_path"] = str(image_file)
        chunk_metadata["parsed_text_markdown"] = md_text
        node = TextNode(
            text="",
            metadata=chunk_metadata,
        )
        nodes.append(node)

    return nodes

In [19]:
# this will split into pages
text_nodes = get_text_nodes(md_json_list, image_dir="data_images")

In [20]:
print(text_nodes[10].get_content(metadata_mode="all"))

page_num: 11
image_path: data_images\ded5715c-130d-4631-b814-f6bfbb644d48-img_p115_1.png
parsed_text_markdown:                                  Course Outline
Topic 1: Project Initiation and Planning with GenAI (K2, A1)
 ●    Initiating a Project - Overview
 ●    Develop Project Charter
 ●    Identify Stakeholders
 ●    Develop Project Management Plan
 ●    Plan Stakeholder Engagement
 ●    Plan Scope Management
 ●    Plan Schedule Management
 ●    Plan Cost Management
 ●    Plan Quality Management
 ●    Plan Resource Management
 ●    Plan Communications Management
 ●    Plan Risk Management
 ●    Plan Risk Responses
 ●    Plan Procurement Management                                                                       11
                         This material belongs to Tertiary Infotech Pte Ltd (UEN: 20120096W). All Rights Reserved


In [21]:
import os
from llama_index.core import (
    StorageContext,
    SummaryIndex,
    load_index_from_storage,
)

if not os.path.exists("storage_nodes_summary"):
    index = SummaryIndex(text_nodes)
    # save index to disk
    index.set_index_id("summary_index")
    index.storage_context.persist("./storage_nodes_summary")
else:
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir="storage_nodes_summary")
    # load index
    index = load_index_from_storage(storage_context, index_id="summary_index")

In [35]:
from pydantic import BaseModel
from openai import OpenAI
from docx import Document
from typing import Dict, List
from llama_parse import LlamaParse
import nest_asyncio
import os
import json

# API access to llama-cloud
LLAMA_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Using OpenAI API for embeddings/llms
OPENAI_API_KEY = os.getenv("TERTIARY_INFOTECH_API_KEY")

# Initialize the OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# Define the Pydantic model for structured data extraction
class KnowledgeStatement(BaseModel):
    id: str
    text: str


class AbilityStatement(BaseModel):
    id: str
    text: str


class Topic(BaseModel):
    name: str
    subtopics: List[str]
    tsc_knowledges: List[KnowledgeStatement]
    tsc_abilities: List[AbilityStatement]


class LearningUnit(BaseModel):
    name: str
    topics: List[Topic]
    learning_outcome: str


class AssessmentMethod(BaseModel):
    code: str
    duration: str


class FacilitatorGuideExtraction(BaseModel):
    course_title: str
    tsc_proficiency_level: str
    learning_units: List[LearningUnit]
    assessments: List[AssessmentMethod]  # New field for assessments

# Function to load FG document content using LlamaParse
def parse_with_llamaparse(file_path: str) -> str:
    """
    Parses the FG document using LlamaParse to preprocess its content.

    Args:
        file_path (str): Path to the document.

    Returns:
        str: Preprocessed content of the document as text.
    """

    parser = LlamaParse(
        api_key=LLAMA_API_KEY,
        result_type="markdown",
        show_progress=True,
        verbose=True,
        num_workers=8
    )

    result = parser.get_json_result(file_path)
    return result

# Function to load FG document content
def load_fg_document(file_path: str) -> str:
    """
    Reads the content of a Word document.

    Args:
        file_path (str): Path to the document.

    Returns:
        str: The document's text content.
    """
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

# Function to extract structured data from FG document
def extract_fg_data(preprocessed_content: str) -> FacilitatorGuideExtraction:
    """
    Extracts relevant information from the FG document using GPT and Pydantic.

    Args:
        preprocessed_content (str): Preprocessed text content of the FG document.

    Returns:
        FacilitatorGuideExtraction: Extracted and validated data.
    """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    """You are an expert at structured data extraction. Extract the following details from the FG Document:
                    - Course Title
                    - TSC Proficiency Level
                    - Learning Units (LUs):
                        * Name of the Learning Unit
                        * Topics in the Learning Unit:
                            - Name of the Topic
                            - Description of the Topic (bullet points or sub-topics)
                            - Full Knowledge Statements associated with the topic, including their identifiers and text (e.g., K1: Range of AI applications)
                            - Full Ability Statements associated with the topic, including their identifiers and text (e.g., A1: Analyze algorithms in the AI applications)
                        * Learning Outcome (LO) for each Learning Unit
                    - Assessment Types and Durations:
                        * Extract assessment types and their durations in the format:
                          {"code": "WA-SAQ", "duration": "1 hr"}
                          {"code": "PP", "duration": "0.5 hr"}
                          {"code": "CS", "duration": "30 mins"}
                        * Interpret abbreviations of assessment methods to their correct types (e.g., "WA-SAQ," "PP," "CS").
                        * Include total durations if mentioned.

                    Return the output in a JSON format that matches the schema provided:
                    {
                        "course_title": "string",
                        "tsc_proficiency_level": "string",
                        "learning_units": [
                            {
                                "name": "string",
                                "topics": [
                                    {
                                        "name": "string",
                                        "subtopics": ["string"],
                                        "tsc_knowledges": [
                                            {"id": "string", "text": "string"}
                                        ],
                                        "tsc_abilities": [
                                            {"id": "string", "text": "string"}
                                        ]
                                    }
                                ],
                                "learning_outcome": "string"
                            }
                        ],
                        "assessments": [
                            {"code": "string", "duration": "string"}
                        ]
                    }
                    """
                ),
            },
            {"role": "user", "content": preprocessed_content},
        ],
        response_format=FacilitatorGuideExtraction,
    )

    return completion.choices[0].message.parsed

# Main Execution
if __name__ == "__main__":
    # Path to the FG document
    file_path = "input/FG_TGS-2024049183 - Project Management with Generative AI (GenAI)_v1.docx"
    
    # Parse document content using LlamaParse
    try:
        parsed_content = parse_with_llamaparse(file_path)
        print("Preprocessed Content:")
        print(json.dumps(parsed_content, indent=4))
    except Exception as e:
        print(f"Error during LlamaParse parsing: {e}")
        exit(1)

    # Extract and validate structured data using OpenAI and Pydantic
    try:
        extracted_data = extract_fg_data(json.dumps(parsed_content))
        print("Structured Data:")
        print(extracted_data)
    except Exception as e:
        print(f"Error during OpenAI extraction: {e}")


Started parsing the file under job_id ab8037a6-c891-42da-8ed5-b06ea54f6a3b
Preprocessed Content:
[
    {
        "pages": [
            {
                "page": 1,
                "text": "             Facilitator Guide\n                         For\n\nProject Management with Generative AI (GenAI)\n\n          TGS Ref No: TGS-2024049183\n\n                   Conducted by\n         TERTIARY INFOTECH PTE. LTD\n\n                UEN: 201200696W\n\n                    Version 1.0",
                "md": "# Facilitator Guide\n\n# For\n\n# Project Management with Generative AI (GenAI)\n\nTGS Ref No: TGS-2024049183\n\n# Conducted by\n\nTERTIARY INFOTECH PTE. LTD\n\nUEN: 201200696W\n\nVersion 1.0",
                "images": [
                    {
                        "name": "img_p0_1.png",
                        "height": 278,
                        "width": 277,
                        "x": 225.25,
                        "y": 157.90076377952806,
                        "original_widt

In [49]:
import pprint
pprint.pprint(dict(extracted_data))

{'assessments': [AssessmentMethod(code='WA-SAQ', duration='1 hr'),
                 AssessmentMethod(code='CS', duration='1 hr')],
 'course_title': 'Project Management with Generative AI (GenAI)',
 'learning_units': [LearningUnit(name='LU1: Project Initiation and Planning with GenAI', topics=[Topic(name='Project Initiation and Planning with GenAI', subtopics=['Initiating a Project - Overview', 'Develop Project Charter', 'Identify Stakeholders', 'Develop Project Management Plan', 'Plan Stakeholder Engagement', 'Plan Scope Management', 'Plan Schedule Management', 'Plan Cost Management', 'Plan Quality Management', 'Plan Resource Management', 'Plan Communications Management', 'Plan Risk Management', 'Plan Risk Responses', 'Plan Procurement Management'], tsc_knowledges=[KnowledgeStatement(id='K2', text='Building project master plans')], tsc_abilities=[AbilityStatement(id='A1', text='Develop project parameters according to business requirements')])], learning_outcome='Develop project paramet

In [36]:
llm_config={
    "config_list": [
        {
            'model': "gpt-4o",
            'api_key': OPENAI_API_KEY,
        },
    ],
    "timeout": 300,
}

## Short Answer Question (SAQ) Generation

In this section, we generate Short Answer Questions (SAQs) based on the extracted data from the Facilitator Guide (FG) document. The process involves the following steps:

1. **Retrieve Content for Knowledge Statements**:
    - The function `retrieve_content_for_knowledge_statement` retrieves content related to the knowledge statements from the provided data using the query engine.

2. **Generate Questions and Answers**:
    - The `qa_generation_agent` generates scenario-based question-answer pairs for each knowledge statement.
    - The questions are designed to align with Bloom's Taxonomy Level 3 (Applying) and provide concise, practical insights.

3. **Output Format**:
    - The generated questions and answers are structured in a JSON format, ensuring they are ready for integration into assessment documents.

4. **Assessment Duration**:
    - The duration for the SAQ assessment is extracted from the provided data.

5. **Execution**:
    - The `user_proxy_agent` initiates the chat with the `qa_generation_agent` to generate the questions and answers based on the retrieved content and specified instructions.

In [37]:
from llama_index.llms.openai import OpenAI as llama_openai
from autogen import AssistantAgent, UserProxyAgent
from autogen.cache import Cache
import streamlit as st
import json
import re
import pprint

class FacilitatorGuideExtraction(BaseModel):
    course_title: str
    tsc_proficiency_level: str
    learning_units: List[LearningUnit]
    assessments: List[AssessmentMethod]

In [38]:
# Retrieve contents for the short answer questions
def retrieve_content_for_knowledge_statement(extracted_data: FacilitatorGuideExtraction, engine):
    """
    Retrieves content related to the knowledge statements from the provided data.

    Args:
        extracted_data (FacilitatorGuideExtraction): The extracted data instance containing course details.

    Returns:
        List[Dict]: A list of dictionaries containing retrieved content and associated abilities.
    """
    retrieved_content = []
    for learning_unit in extracted_data.learning_units:
        for topic in learning_unit.topics:
            for knowledge in topic.tsc_knowledges:
                knowledge_id = knowledge.id
                knowledge_statement = knowledge.text
                topic_name = topic.name
                # Query the index to retrieve topic content for this Knowledge Statement
                response = engine.query(
                    f"Retrieve the most relevant inline segments aligned to Knowledge Statement: {knowledge_statement}\n"
                    f"From the given Topic: {topic_name}"        
                )
                # pprint_response(response, show_source=True)

                # Add the structured data using Pydantic model
                try:
                    retrieved_content.append({
                        "knowledge_id": knowledge_id,
                        "knowledge_statement": knowledge_statement,
                        "retrieved_content": response.response
                    })
                except Exception as e:
                    print(f"Error adding structured data for {knowledge_id}: {e}")
    return retrieved_content

In [39]:
openai_api_key = llm_config["config_list"][0]["api_key"]

system_prompt = """\
You are a content retrieval assistant. Your role is to retrieve topic content that aligns strictly with the specified Knowledge Statement.

Your role:
1. Restrict your retrieval strictly to the specified topic provided in the query.
2. Retrieve content from the topic that directly aligns with the provided Knowledge Statement.
3. If no specific content directly aligns with the Knowledge Statement, provide a general summary of the specified topic instead.
4. Include any example/usecase code or equations relevant to the topic or subtopics.
5. Prioritize retrieving content that are theoretical.
6. Identify and extract the exact inline segments from the provided documents that directly correspond to the content used to generate the given answer. The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text in the provided documents.

Ensure that:
- (Important) Each retrieved segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the Knowledge Statement is clear and directly supports the summary provided.
- (Important) If you didn't used the specific document or topic, do not mention it.
- If no relevant information is found for the Knowledge Statement, clearly state this and provide a general topic summary instead.

Restrictions:
- Do not include content from other topics or slides outside the specified topic.
- Each retrieved segment must explicitly belong to the given topic.
- Avoid including assumptions or content outside the scope of the Knowledge Statement.

You must always provide:
1. The retrieved content aligned with the Knowledge Statement.
2. A list of verbatim extracted segments that directly support the retrieved content, each labeled with the topic and document it belongs to.
"""
ks_generation_llm = llama_openai(model="gpt-4o-mini", api_key=openai_api_key, system_prompt=system_prompt)
ks_generation_query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=ks_generation_llm,
    response_mode="tree_summarize"
    # response_mode="compact",
    # node_postprocessors=[cohere_rerank],
)
retrieved_content = retrieve_content_for_knowledge_statement(extracted_data, ks_generation_query_engine)


In [83]:
pprint.pprint(retrieved_content)

[{'knowledge_id': 'K1',
  'knowledge_statement': 'Process characterization',
  'retrieved_content': 'No specific content directly aligns with the Knowledge '
                       'Statement "Process characterization" from the provided '
                       'documents. \n'
                       '\n'
                       '**General Summary of the Topic: Fundamentals of Design '
                       'of Experiment (K1, A1):**\n'
                       'The Fundamentals of Design of Experiment (DOE) involve '
                       'systematic methods for planning, conducting, '
                       'analyzing, and interpreting controlled tests to '
                       'evaluate the factors that may influence a particular '
                       'outcome or process. Key concepts include:\n'
                       '\n'
                       '1. **Process Characterization**: Understanding the '
                       'relationships between input variables (factors) and '
   

In [40]:
user_proxy_agent = UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",   
    max_consecutive_auto_reply=5,
    is_termination_msg=lambda msg: msg.get("content", "") and "TERMINATE" in msg["content"],
    code_execution_config={"work_dir": "output", "use_docker": False, "response_format": {"type": "json_object"}}
)

# Autogen setup
qa_generation_agent = AssistantAgent(
    name="question_answer_generator",
    system_message=f"""
    You are an expert educator in '{extracted_data.course_title}'. 
    You will create knowledge-based scenario question-answer pairs based on the retrieved content.

    ### Instructions:
    1. Generate **exactly one** question-and-answer pair per knowledge statement.
    2. **Scenario Requirements**:
       - Provide a **2–3 sentence** realistic scenario that offers context.
       - Describe an industry, organization, or setting related to the knowledge statement.
       - Include a brief goal or problem the scenario is trying to solve.
       - Write it in an **"analyze and recommend"** style, aligned with Bloom's Taxonomy Level 3 (Applying).
       - Example: 
         "An eCommerce company is considering AI applications but is unsure of the specific areas where AI could make the most impact. Analyze and recommend areas within the eCommerce industry where AI is most applicable."
    3. **Question Requirements**:
       - Phrase the question so it aligns with the scenario and Bloom's Taxonomy Level 3 (Applying).
       - Example:
         "Which AI applications can be implemented for maximum impact, and how should they be prioritized?"
    4. **Answer Requirements**:
       - Short Answer Question (SAQ) style.
       - Provide 3–4 concise bullet points **only** (no extra commentary).
       - Each bullet should be a key knowledge point or practical insight.
    5. **If no relevant content** is found for a given knowledge statement:
       - "scenario": "No relevant scenario found."
       - "question_statement": "No relevant content found for this knowledge statement."
       - "answer": ["No relevant content found."]
    6. Structure the final output in **valid JSON** with the format:

    ```json
    {{
        "course_title": "<course_title_here>",
        "duration": "<assessment_duration>",
        "questions": [
            {{
                "scenario": "<scenario>",
                "question_statement": "<question>",
                "knowledge_id": "<knowledge_id>",
                "answer": [
                    "<bullet_point_1>",
                    "<bullet_point_2>",
                    "<bullet_point_3>"
                ]
            }},
            ...
        ]
    }}
    ```
    
    7. Return the JSON between triple backticks followed by 'TERMINATE'.
    """,
    llm_config=llm_config,
)

assessment_duration = ""
for assessment in extracted_data.assessments:
    if "SAQ" in assessment.code:
        assessment_duration = assessment.duration
saq_context = {}
with Cache.disk() as cache:
    chat_result = user_proxy_agent.initiate_chat(
        qa_generation_agent,
        message=f"""
        Please generate the questions and answers using the following course title: '{extracted_data.course_title}', 
        assessment duration: '{assessment_duration}', and topic contents: {retrieved_content}.
        Ensure suggestive answers are provided as bullet points, concise and practical, covering key aspects of the knowledge statement.
        Phrase questions in alignment with Bloom's Taxonomy Level: {extracted_data.tsc_proficiency_level}.
        Bloom's Taxonomy Levels:
            Level 1: Remembering
            Level 2: Understanding
            Level 3: Applying
            Level 4: Analyzing
            Level 5: Evaluating
            Level 6: Creating
        Return the output in JSON string format with specific detailed answers as bullet points.
        RETURN 'TERMINATE' once the generation is complete.
        """,
        summary_method="reflection_with_llm",
        cache=cache,
)
try:
    last_message_content = chat_result.chat_history[-1].get("content", "")
    if not last_message_content:
        print("No content found in the agent's last message.")
    last_message_content = last_message_content.strip()
    json_pattern = re.compile(r'```json\s*(\{.*?\})\s*```', re.DOTALL)
    json_match = json_pattern.search(last_message_content)
    if json_match:
        json_str = json_match.group(1)
        context = json.loads(json_str)
        saq_context = context
        print(f"CONTEXT JSON MAPPING: \n\n{context}")
    pprint.pprint(f"SAQ cost: {chat_result.cost}")
except json.JSONDecodeError as e:
    print(f"Error parsing context JSON: {e}")


[33muser_proxy[0m (to question_answer_generator):


        Please generate the questions and answers using the following course title: 'Project Management with Generative AI (GenAI)', 
        assessment duration: '1 hr', and topic contents: [{'knowledge_id': 'K2', 'knowledge_statement': 'Building project master plans', 'retrieved_content': 'To build project master plans, the following key components and processes are essential:\n\n1. **Develop Project Management Plan**: This involves creating a comprehensive plan that outlines how the project will be executed, monitored, and controlled. It serves as a roadmap for the project team.\n\n2. **Plan Scope Management**: This includes defining and controlling what is included and excluded in the project. It ensures that all necessary work is included to complete the project successfully.\n\n3. **Plan Schedule Management**: This process involves determining the policies, procedures, and documentation for planning, developing, managing, exec

In [41]:
import os
import tempfile
from docxtpl import DocxTemplate

def _ensure_list(answer):
    """
    Returns a list regardless of whether 'answer' is originally a list, string, or None.
    """
    if isinstance(answer, list):
        return answer
    elif isinstance(answer, str):
        # Wrap the string in a list so Jinja loops or docxtpl loops won't break
        return [answer]
    return []

def generate_documents(context: dict, assessment_type: str, output_dir: str) -> dict:
    """
    Generate the question paper and answer paper for the given context and type.

    Parameters:
    - context (dict): The data for the assessment (course title, type, questions, etc.).
    - assessment_type (str): The assessment type (e.g., 'Ability-based', 'Knowledge-based').
    - output_dir (str): Directory where the generated documents will be saved.

    Returns:
    - dict: Paths to the generated documents (question and answer papers).
    """

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load template paths
    TEMPLATES = {
        "QUESTION": f"../Templates/(Template) {assessment_type} - Course Title - v1.docx",
        "ANSWER": f"../Templates/(Template) Answer to {assessment_type} - Course Title - v1.docx"
    }


    qn_template = TEMPLATES["QUESTION"]
    ans_template = TEMPLATES["ANSWER"]

    question_doc = DocxTemplate(qn_template)
    answer_doc = DocxTemplate(ans_template)
    
    # Prepare context for the answer paper:
    # Convert each question's 'answer' to a list if it's a string or None
    answer_context = {
        **context,
        "questions": [
            {
                **question,
                "answer": _ensure_list(question.get("answer"))
            }
            for question in context.get("questions", [])
        ]
    }

    # Prepare context for the question paper by creating a copy of the context without answers
    question_context = {
        **context,
        "questions": [
            {
                **question,
                "answer": None  # Remove answers for the question doc
            }
            for question in context.get("questions", [])
        ]
    }

    # Render both templates
    # IMPORTANT FIX: Use answer_context instead of context for the answer doc
    answer_doc.render(answer_context)
    question_doc.render(question_context)

    # Create temporary files for the question and answer documents
    question_tempfile = tempfile.NamedTemporaryFile(
        delete=False, suffix=f"_{assessment_type} - {context.get('course_title')} - v1.docx"
    )
    answer_tempfile = tempfile.NamedTemporaryFile(
        delete=False, suffix=f"Answer to {assessment_type} - {context.get('course_title')} - v1.docx"
    )

    # Save the rendered documents to the temporary files
    question_doc.save(question_tempfile.name)
    answer_doc.save(answer_tempfile.name)

    return {
        "ASSESSMENT_TYPE": assessment_type,
        "QUESTION": question_tempfile.name,
        "ANSWER": answer_tempfile.name
    }


In [42]:
files = generate_documents(
    context=saq_context, 
    assessment_type="WA (SAQ)",
    output_dir="output"
)

print(f"Generated documents: {files}")

Generated documents: {'ASSESSMENT_TYPE': 'WA (SAQ)', 'QUESTION': 'C:\\Users\\dljh1\\AppData\\Local\\Temp\\tmptbzljew8_WA (SAQ) - Project Management with Generative AI (GenAI) - v1.docx', 'ANSWER': 'C:\\Users\\dljh1\\AppData\\Local\\Temp\\tmpifkrivg0Answer to WA (SAQ) - Project Management with Generative AI (GenAI) - v1.docx'}


## Practical Performance (PP) Scenario Generation

In this section, we generate a practical performance (PP) scenario based on the extracted data from the Facilitator Guide (FG) document. The process involves the following steps:

1. **Generate a Realistic Scenario**:
    - The function `generate_pp_scenario` creates a concise, realistic scenario for the practical performance assessment.
    - The scenario aligns with the course title, learning outcomes, and abilities.

2. **Retrieve Content for Learning Outcomes**:
    - The `retrieve_content_for_learning_outcomes` function retrieves content related to the learning outcomes and abilities from the provided data using the query engine.

3. **Generate Questions and Answers**:
    - The `qa_generation_agent` generates practical performance question-answer pairs based on the scenario and retrieved content.
    - The questions describe hands-on tasks or practical activities related to the scenario.
    - The answers provide the exact solutions or outputs for the tasks.

4. **Output Format**:
    - The generated questions and answers are structured in a JSON format, ensuring they are ready for integration into assessment documents.

5. **Assessment Duration**:
    - The duration for the practical performance assessment is extracted from the provided data.

6. **Execution**:
    - The `user_proxy_agent` initiates the chat with the `qa_generation_agent` to generate the questions and answers based on the scenario and retrieved content.

In [101]:
class FacilitatorGuideExtraction(BaseModel):
    course_title: str
    tsc_proficiency_level: str
    learning_units: List[LearningUnit]
    assessments: List[AssessmentMethod]

def generate_pp_scenario(data: FacilitatorGuideExtraction, engine) -> str:
    """
    Generates a concise, realistic scenario for the practical performance.
    Args:
        course_title (str): The title of the course.
        learning_outcomes (List[str]): A list of learning outcomes.

    Returns:
        str: A concise scenario for the case study.
    """
    
    # Retrieve the course title and bloom taxonomy level
    course_title = data.course_title
    bloom_taxonomy_level = data.tsc_proficiency_level

    # Extract the learning outcomes as a list of strings
    learning_outcomes = [lu.learning_outcome for lu in data.learning_units]
    abilities = [ability.text for lu in data.learning_units for topic in lu.topics for ability in topic.tsc_abilities]
    
    outcomes_text = "\n".join([f"- {lo}" for lo in learning_outcomes])
    abilities_text = "\n".join([f"- {ability}" for ability in abilities])

    prompt = (
        f"You are tasked with designing a realistic practical performance assessment scenario for the course '{course_title}'.\n\n"
        f"The scenario should align with the following:\n\n"
        f"Learning Outcomes:\n{outcomes_text}\n\n"
        f"Abilities:\n{abilities_text}\n\n"
        f"Bloom's Taxonomy Level:\n{bloom_taxonomy_level}\n\n"
        "The scenario should describe a company or organization facing practical challenges and provide context for the learners to apply their skills.\n"
        "Ensure the scenario is concise (1 paragraph), realistic, and action-oriented, focusing on the summary of tasks learners must perform without requiring extensive deliverables."
    )
    response = engine.query(prompt)
    return response.response.strip()

def retrieve_content_for_learning_outcomes(extracted_data, engine):
    """
    Retrieves content related to the learning outcomes and abilities from the provided data.

    Args:
        extracted_data (FacilitatorGuideExtraction): The extracted data instance containing course details.

    Returns:
        List[Dict]: A list of dictionaries containing retrieved content and associated abilities.
    """
    retrieved_content = []

    for learning_unit in extracted_data.learning_units:
        learning_outcome = learning_unit.learning_outcome
        associated_abilities = []
        ability_ids = []
        for topic in learning_unit.topics:
            associated_abilities.extend(topic.tsc_abilities)
            ability_ids.extend([ability.id for ability in topic.tsc_abilities])

        # Define the content retrieval prompt
        retrieval_prompt = (
            f"Retrieve the most relevant inline segments aligned to Learning Outcome: {learning_outcome}\n"
            f"Associated Abilities:\n"
            + "\n".join([f"- [{ability.id}] {ability.text}" for ability in associated_abilities])
            + f"\nFrom the given Topics: {', '.join([topic.name for topic in learning_unit.topics])}"
        )

        response = engine.query(retrieval_prompt)
        retrieved_content.append({
            "learning_outcome": learning_outcome,
            "abilities": ability_ids,
            "retrieved_content": response.response
        })
    
    return retrieved_content

In [103]:
import os
import re
import json
import pprint
from autogen import AssistantAgent, UserProxyAgent
from autogen.cache import Cache
from llama_index.llms.openai import OpenAI as llama_openai


openai_api_key = OPENAI_API_KEY
system_prompt = """\
You are an instructional design assistant tasked with generating concise, realistic, and practical scenario-based question-answer pairs for educational purposes.

Your role:
1. **Generate a real-world scenario** for the given Course Title and Learning Outcome (LO). The scenario must:
- Be concise (1-2 paragraphs) while clearly describing the organizational challenges or context.
- Align directly with the Learning Outcome and be applicable to the associated abilities.
- Highlight specific organizational data, challenges, and objectives to ensure relevance and practicality.

2. Use only the information relevant to the specified Learning Unit, Learning Outcome, and its abilities. Do not include information from unrelated topics.

3. Ensure that:
- Each scenario and question-answer pair is realistic, aligned to Bloom's Taxonomy level for the LO, and practically applicable.
- If no relevant content exists, create a general scenario that remains educationally valuable and tied to the broader course theme.
    
**Output Format:**
- Tase study scenario have to be at least 500 words long.
- You will output your response in the following format. For example:
TechFusion, a leading software solutions provider, has been approached by a global bank to develop a new mobile banking application. The bank wants to offer its customers a seamless, secure, and intuitive banking experience on their smartphones. Given the competitive landscape, the bank emphasizes the need for rapid delivery without compromising on quality. TechFusion has recently adopted Agile methodologies with Scrum and DevOps practices and sees this project as an opportunity to showcase its capabilities in these areas.

**Restrictions:**
- Do not include content from other topics or unrelated slides.
- Do not invent abilities or knowledge outside the scope of the LO and its associated abilities.
"""

scenario_llm = llama_openai(model="gpt-4o-mini", api_key=openai_api_key, system_prompt=system_prompt)

scenario_query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=scenario_llm,
    response_mode="compact",
)
# Generate the shared scenario
scenario = generate_pp_scenario(extracted_data, scenario_query_engine)

system_prompt = """\
You are a content retrieval assistant. Your role is to retrieve topic content that aligns strictly with the specified Learning Outcome (LO) and its associated abilities.

Your role:
1. Restrict your retrieval strictly to the specified topic provided in the query.
2. Retrieve content from the topic that directly aligns with the provided Learning Outcome (LO) and its abilities.
3. If no specific content directly aligns with the Learning Outcome or abilities, provide a general summary of the topic instead.
4. Include any example/usecase code or equations relevant to the topic or subtopics.
5. Prioritize retrieving content that are practical.
6. Identify and extract the exact inline segments from the provided documents that directly correspond to the content used to generate the summary. The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text in the provided documents.

Ensure that:
- (Important) Each retrieved segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the Learning Outcome or abilities is clear and directly supports the summary provided.
- (Important) If you didn't use the specific document or topic, do not mention it.
- If no relevant information is found for the Learning Outcome, clearly state this and provide a general topic summary instead.

Restrictions:
- Do not include content from other topics or slides outside the specified topic.
- Each retrieved segment must explicitly belong to the given topic.
- Avoid including assumptions or content outside the scope of the Learning Outcome and abilities.

You must always provide:
1. The retrieved content aligned with the Learning Outcome and abilities.
2. A list of verbatim extracted segments that directly support the retrieved content, each labeled with the topic and document it belongs to.
"""

lo_retriever_llm = llama_openai(model="gpt-4o-mini", api_key=openai_api_key, system_prompt=system_prompt)
qa_generation_query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=lo_retriever_llm,
    response_mode="tree_summarize",
)
retrieved_content = retrieve_content_for_learning_outcomes(extracted_data, qa_generation_query_engine)
pprint.pprint(retrieved_content)


[{'abilities': ['A1'],
  'learning_outcome': 'Analyze process interactions to characterize key '
                      'factors influencing performance in Design of '
                      'Experiments (DoE).',
  'retrieved_content': 'No specific content related to the Learning Outcome of '
                       'analyzing process interactions to characterize key '
                       'factors influencing performance in Design of '
                       'Experiments (DoE) was found in the provided '
                       'documents. \n'
                       '\n'
                       'Here is a general summary of the topic:\n'
                       '\n'
                       '**Design of Experiments (DoE)** is a statistical '
                       'approach used to plan, conduct, and analyze controlled '
                       'tests to evaluate the factors that may influence a '
                       'particular outcome. The fundamental goal of DoE is to '
               

In [104]:
user_proxy_agent = UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=5,
    is_termination_msg=lambda msg: msg.get("content", "") and "TERMINATE" in msg["content"],
    code_execution_config={"work_dir": "output", "use_docker": False, "response_format": {"type": "json_object"}}
)

In [105]:
qa_generation_agent = AssistantAgent(
    name="question_answer_generator",
    system_message=f"""
    You are an expert educator in '{extracted_data.course_title}'. You will create scenario-based practical performance assessment (PPA) question-answer pairs based on course data.
    The data will include:
    - A scenario
    - Retrieved content aligned with learning outcomes and abilities

    ### Instructions:
    1. Use the provided scenario and retrieved content to generate **one practical question-answer pair per learning outcome.**

    2. **Question Requirements:**
    - Each question must describe a hands-on task or practical activity related to the scenario.
    - Ensure the task can be performed based on the scenario's context without assuming learners have additional information.
    - For coding courses, specify programming tasks with clear starting points. For non-coding courses, describe actions such as planning, decision-making, evaluating data, or implementing processes.
    - Ensure the question ends with "Take snapshots of your commands at each step and paste them below."

    3. **Answer Requirements:**
    - **Provide the exact solutions** (e.g., final code, commands, outputs, or tangible deliverables) rather than describing how to capture the snapshots.
    - If there is relevant text or direct quotes from the retrieved content, include them verbatim with proper citations, for example, "(Source: [retrieved_content_reference])".
    - Each answer should contain only the final (correct) output or solution. Avoid step-by-step instructions on capturing or documenting the process.

    4. Structure the final output in valid JSON with the following format:
    
    ```json
    {{
        "course_title": "<course_title_here>",
        "duration": "<assessment_duration_here>",
        "scenario": "<scenario_here>",
        "questions": [
            {{
                "question_statement": "<question_text>",
                "answer": ["<list_of exact final solutions or outputs>"],
                "ability_id": ["<list_of_ability_ids>"]
            }},
            ...
        ]
    }}
    ```
    5. Ensure the generated practical tasks align strictly with the retrieved content and abilities. If there is relevant text or direct quotes from the retrieved content, include them verbatim. Use a format like “(Source: [retrieved_content_reference])” where needed.
    6. Return the JSON between triple backticks followed by 'TERMINATE'.
    """,
    llm_config=llm_config,
)




## Experimental Retrieval with Citation

In this section, we perform experimental retrieval of content with proper citations. The process involves the following steps:

1. **Define Retrieval Prompt**:
    - Create a prompt to retrieve content aligned with specific learning outcomes and abilities.

2. **Execute Retrieval**:
    - Use the query engine to retrieve relevant content based on the defined prompt.

3. **Cite Sources**:
    - Ensure that all retrieved content includes proper citations to the original sources.

4. **Output Format**:
    - Structure the retrieved content and citations in a clear and organized format for easy reference and validation.

This approach ensures that the retrieved content is both relevant and properly attributed, maintaining academic integrity and providing clear references for further study.

In [None]:
qa_generation_agent = AssistantAgent(
    name="question_answer_generator",
    system_message=f"""
    You are an expert educator in '{extracted_data.course_title}'. You will create scenario-based practical performance assessment (PPA) question-answer pairs based on course data.
    The data will include:
    - A scenario
    - Retrieved content aligned with learning outcomes and abilities

    ### Instructions:
    1. Use the provided scenario and retrieved content to generate **one practical question-answer pair per learning outcome.**

    2. **Question Requirements:**
    - Each question must describe a hands-on task or practical activity related to the scenario.
    - Ensure the task can be performed based on the scenario's context without assuming learners have additional information.
    - For coding courses, specify programming tasks with clear starting points. For non-coding courses, describe actions such as planning, decision-making, evaluating data, or implementing processes.
    - Ensure the question ends with "Take snapshots of your commands at each step and paste them below."

    3. **Answer Requirements:**
    - The answer must explicitly describe the step-by-step practical actions the learner must perform to produce the snapshots.
    - The steps must:
        - Clearly specify **what learners should do** (e.g., commands to execute, actions to take, or configurations to make).
        - **Describe the output or results** learners will observe and capture in their snapshots.
        - Align strictly with the question's task and scenario.
    - For coding tasks:
        - Include steps such as writing and executing code, along with descriptions of expected outputs or configurations.
        - Example: "The snapshot should include: (1) A screenshot of the Python code editor with your script, (2) The terminal output showing the API response, and (3) A log of any errors encountered during execution."
    - For management or industry tasks:
        - Include steps for creating tangible deliverables such as project plans, reports, or visualizations.
        - Example: "The snapshot should include: (1) A screenshot of the completed project timeline, (2) A visualization of key performance indicators (e.g., chart or table), and (3) A summary report of key decisions."
    - Avoid theoretical explanations, general advice, or abstract concepts. The answer must focus exclusively on describing concrete, observable outputs aligned with the scenario's context.

    4. Structure the final output in valid JSON with the following format:
    
json
    {{
        "course_title": "<course_title_here>",
        "duration": "<assessment_duration_here>",
        "scenario": "<scenario_here>",
        "questions": [
            {{
                "question_statement": "<question_text>",
                "answer": ["<list_of_practical_steps describing the snapshot answer>"],
                "ability_id": ["<list_of_ability_ids>"]
            }},
            ...
        ]
    }}

    5. Ensure the generated practical tasks align strictly with the retrieved content and abilities. 
       - **If there is relevant text or direct quotes from the retrieved content, include them verbatim.** 
       - If you include a direct citation, use a format like "(Source: [retrieved_content_reference])" to clearly indicate its origin.
    6. Return the JSON between triple backticks followed by 'TERMINATE'.
    """,
    llm_config=llm_config,
)



[33muser_proxy[0m (to question_answer_generator):


        Please generate the questions and answer using the following course title:'Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini', assessment_duration:'', scenario: '**Scenario:**

TechSolutions, a mid-sized technology firm specializing in software development, is facing challenges in optimizing its customer support operations. The company has recently integrated a Generative AI (GAI) system powered by Google Gemini to enhance its customer interaction capabilities. However, the management is unsure about the effectiveness of the AI in handling customer queries and its impact on overall operational efficiency. As part of a practical performance assessment, learners are tasked with analyzing the range of LLM applications utilized by TechSolutions, identifying their strengths and limitations in the context of customer support. They must establish the design of the Google Gemini GAI syste

In [108]:
assessment_duration = ""
for assessment in extracted_data.assessments:
    if "PP" in assessment.code:
        assessment_duration = assessment.duration

with Cache.disk() as cache:
    chat_result = user_proxy_agent.initiate_chat(
        qa_generation_agent,
        message=f"""
        Please generate the questions and answer using the following course title:'{extracted_data.course_title}', assessment_duration:'{assessment_duration}', scenario: '{scenario}' and topic contents:{retrieved_content}
        Phrase your question in accordance with the Bloom's Taxonomy Level: {extracted_data.tsc_proficiency_level}
        Bloom's Taxonomy Level Information:
            Level 1: Remembering
            Level 2: Understanding
            Level 3: Applying
            Level 4: Analyzing
            Level 5: Evaluating
            Level 6: Creating
        Ensure the question ends with "Take snapshots of your commands at each step and paste them below."
        Ensure the answer begins with "The snapshot should include: " and specifies only practical steps to test hands-on skills without any writing or documenting.
        RETURN 'TERMINATE' once the generation is done.
        """,
        summary_method="reflection_with_llm",
        cache=cache,
)
try:
    last_message_content = chat_result.chat_history[-1].get("content", "")
    if not last_message_content:
        print("No content found in the agent's last message.")
    last_message_content = last_message_content.strip()
    json_pattern = re.compile(r'```json\s*(\{.*?\})\s*```', re.DOTALL)
    json_match = json_pattern.search(last_message_content)
    if json_match:
        json_str = json_match.group(1)
        context = json.loads(json_str)
        pprint.pprint(f"CONTEXT JSON MAPPING: \n\n{context}")
    pprint.pprint(f"CS cost: {chat_result.cost}")
except json.JSONDecodeError as e:
    print(f"Error parsing context JSON: {e}")

[33muser_proxy[0m (to question_answer_generator):


        Please generate the questions and answer using the following course title:'Practical Design of Experiment (DoE) for Engineers and Researchers', assessment_duration:'', scenario: '**Scenario:**

InnovateTech, a mid-sized manufacturing company specializing in electronic components, is experiencing a decline in product quality and an increase in production costs. The management has identified that several factors, including machine calibration, raw material quality, and operator training, may be influencing the performance of their production processes. To address these issues, the company has decided to implement a Design of Experiments (DoE) approach to systematically analyze these interactions. As part of a team of engineers and researchers, you are tasked with analyzing the process interactions to identify key factors affecting performance, selecting suitable factorial DoE projects based on relevant performance metrics, def

In [109]:
from docxtpl import DocxTemplate
import tempfile

def generate_documents(context: dict, assessment_type: str, output_dir: str) -> dict:
    """
    Generate the question paper and answer paper for the given context and type.

    Parameters:
    - context (dict): The data for the assessment (course title, type, questions, etc.).
    - assessment_type (str): The assessment type (e.g., 'Ability-based', 'Knowledge-based').
    - output_dir (str): Directory where the generated documents will be saved.

    Returns:
    - dict: Paths to the generated documents (question and answer papers).
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    # Load templates
    TEMPLATES = {
        "QUESTION": f"C:/Users/dljh1/Documents/courseware_autogen/Assessment/Templates/(Template) {assessment_type} - Course Title - v1.docx",
        "ANSWER": f"C:/Users/dljh1/Documents/courseware_autogen/Assessment/Templates/(Template) Answer to {assessment_type} - Course Title - v1.docx"
        }

    qn_template = TEMPLATES["QUESTION"]
    ans_template = TEMPLATES["ANSWER"]
    question_doc = DocxTemplate(qn_template)
    answer_doc = DocxTemplate(ans_template)
    
    # Prepare context for the question paper by creating a copy of the context without answers
    question_context = {
        **context,
        "questions": [
            {**question, "answer": None} for question in context.get("questions", [])
        ]
    }


    # Render both templates
    answer_doc.render(context)  # Render with answers
    question_doc.render(question_context)  # Render without answers

    # Create temporary files for the question and answer documents
    question_tempfile = tempfile.NamedTemporaryFile(
        delete=False, suffix=f"_{assessment_type}_Questions.docx"
    )
    answer_tempfile = tempfile.NamedTemporaryFile(
        delete=False, suffix=f"_{assessment_type}_Answers.docx"
    )

    # Save the rendered documents to the temporary files
    question_doc.save(question_tempfile.name)
    answer_doc.save(answer_tempfile.name)

    return {
        "ASSESSMENT_TYPE": assessment_type,
        "QUESTION": question_tempfile.name,
        "ANSWER": answer_tempfile.name
    }

In [110]:
files = generate_documents(
    context=context, 
    assessment_type="PP",
    output_dir="output"
)

print(f"Generated documents: {files}")

Generated documents: {'ASSESSMENT_TYPE': 'PP', 'QUESTION': 'C:\\Users\\dljh1\\AppData\\Local\\Temp\\tmpos07pia9_PP_Questions.docx', 'ANSWER': 'C:\\Users\\dljh1\\AppData\\Local\\Temp\\tmpyj4nc5bw_PP_Answers.docx'}


In [50]:
# Generate a detailed scenario for the case study
def generate_case_study_scenario(data: FacilitatorGuideExtraction, engine) -> str:
    """
    Generates a concise, realistic scenario for the case study.
    Args:
        course_title (str): The title of the course.
        learning_outcomes (List[str]): A list of learning outcomes.

    Returns:
        str: A concise scenario for the case study.
    """
    
    # Retrieve the course title and bloom taxonomy level
    course_title = data.course_title
    bloom_taxonomy_level = data.tsc_proficiency_level

    # Extract the learning outcomes as a list of strings
    learning_outcomes = [lu.learning_outcome for lu in data.learning_units]
    abilities = [ability.text for lu in data.learning_units for topic in lu.topics for ability in topic.tsc_abilities]
    
    outcomes_text = "\n".join([f"- {lo}" for lo in learning_outcomes])
    abilities_text = "\n".join([f"- {ability}" for ability in abilities])

    prompt = (
        f"You are tasked with designing a concise, realistic case study scenario for the course '{course_title}'.\n\n"
        f"The scenario should align with the following:\n\n"
        f"Learning Outcomes:\n{outcomes_text}\n\n"
        f"Abilities:\n{abilities_text}\n\n"
        f"Bloom's Taxonomy Level:\n{bloom_taxonomy_level}\n\n"
        "The scenario should describe a company or organization facing challenges related to communication, collaboration, or customer satisfaction. "
        "Ensure the scenario is realistic and practical, and keep it to 1-2 paragraphs without markdown elements or formatting."
    )
    response = engine.query(prompt)
    return response.response.strip()

def retrieve_content_for_learning_outcomes(extracted_data, engine):
    """
    Retrieves content related to the learning outcomes and abilities from the provided data.

    Args:
        extracted_data (FacilitatorGuideExtraction): The extracted data instance containing course details.

    Returns:
        List[Dict]: A list of dictionaries containing retrieved content and associated abilities.
    """
    retrieved_content = []

    for learning_unit in extracted_data.learning_units:
        learning_outcome = learning_unit.learning_outcome
        associated_abilities = []
        ability_ids = []
        for topic in learning_unit.topics:
            associated_abilities.extend(topic.tsc_abilities)
            ability_ids.extend([ability.id for ability in topic.tsc_abilities])

        # Define the content retrieval prompt
        retrieval_prompt = (
            f"Retrieve the most relevant inline segments aligned to Learning Outcome: {learning_outcome}\n"
            f"Associated Abilities:\n"
            + "\n".join([f"- [{ability.id}] {ability.text}" for ability in associated_abilities])
            + f"\nFrom the given Topics: {', '.join([topic.name for topic in learning_unit.topics])}"
        )

        response = engine.query(retrieval_prompt)
        retrieved_content.append({
            "learning_outcome": learning_outcome,
            "abilities": ability_ids,
            "retrieved_content": response.response
        })
    
    return retrieved_content

In [51]:
from llama_index.llms.openai import OpenAI as llama_openai
from autogen import AssistantAgent, UserProxyAgent
from autogen.cache import Cache
import streamlit as st
import json
import re
import pprint

openai_api_key = llm_config["config_list"][0]["api_key"]
system_prompt = """\
You are an instructional design assistant tasked with generating concise, realistic, and practical scenario-based question-answer pairs for educational purposes.

Your role:
1. **Generate a real-world scenario** for the given Course Title and Learning Outcome (LO). The scenario must:
- Be concise (1-2 paragraphs) while clearly describing the organizational challenges or context.
- Align directly with the Learning Outcome and be applicable to the associated abilities.
- Highlight specific organizational data, challenges, and objectives to ensure relevance and practicality.

2. Use only the information relevant to the specified Learning Unit, Learning Outcome, and its abilities. Do not include information from unrelated topics.

3. Ensure that:
- Each scenario and question-answer pair is realistic, aligned to Bloom's Taxonomy level for the LO, and practically applicable.
- If no relevant content exists, create a general scenario that remains educationally valuable and tied to the broader course theme.

**Output Format:**
- Tase study scenario have to be at least 500 words long.
- You will output your response in the following format. For example:
TechFusion, a leading software solutions provider, has been approached by a global bank to develop a new mobile banking application. The bank wants to offer its customers a seamless, secure, and intuitive banking experience on their smartphones. Given the competitive landscape, the bank emphasizes the need for rapid delivery without compromising on quality. TechFusion has recently adopted Agile methodologies with Scrum and DevOps practices and sees this project as an opportunity to showcase its capabilities in these areas.

**Restrictions:**
- Do not include content from other topics or unrelated slides.
- Do not invent abilities or knowledge outside the scope of the LO and its associated abilities.
"""


scenario_llm = llama_openai(model="gpt-4o-mini", api_key=openai_api_key, system_prompt=system_prompt)

scenario_query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=scenario_llm,
    response_mode="compact",
)
# Generate the shared scenario
scenario = generate_case_study_scenario(extracted_data, scenario_query_engine)

system_prompt = """\
You are a content retrieval assistant. Your role is to retrieve topic content that aligns strictly with the specified Learning Outcome (LO) and its associated abilities.

Your role:
1. Restrict your retrieval strictly to the specified topic provided in the query.
2. Retrieve content from the topic that directly aligns with the provided Learning Outcome (LO) and its abilities.
3. If no specific content directly aligns with the Learning Outcome or abilities, provide a general summary of the topic instead.
4. Include any example/usecase code or equations relevant to the topic or subtopics.
5. Prioritize retrieving content that are practical.
6. Identify and extract the exact inline segments from the provided documents that directly correspond to the content used to generate the summary. The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text in the provided documents.

Ensure that:
- (Important) Each retrieved segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the Learning Outcome or abilities is clear and directly supports the summary provided.
- (Important) If you didn't use the specific document or topic, do not mention it.
- If no relevant information is found for the Learning Outcome, clearly state this and provide a general topic summary instead.

Restrictions:
- Do not include content from other topics or slides outside the specified topic.
- Each retrieved segment must explicitly belong to the given topic.
- Avoid including assumptions or content outside the scope of the Learning Outcome and abilities.

You must always provide:
1. The retrieved content aligned with the Learning Outcome and abilities.
2. A list of verbatim extracted segments that directly support the retrieved content, each labeled with the topic and document it belongs to.
"""

lo_retriever_llm = llama_openai(model="gpt-4o-mini", api_key=openai_api_key, system_prompt=system_prompt)
qa_generation_query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=lo_retriever_llm,
    response_mode="tree_summarize",
)
retrieved_content = retrieve_content_for_learning_outcomes(extracted_data, qa_generation_query_engine)

In [52]:
pprint.pprint(retrieved_content)

[{'abilities': ['A1'],
  'learning_outcome': 'Develop project parameters with GenAI according to '
                      'business requirements',
  'retrieved_content': 'To align with the Learning Outcome of developing '
                       'project parameters with GenAI according to business '
                       'requirements, the following relevant content has been '
                       'extracted:\n'
                       '\n'
                       '### Retrieved Content:\n'
                       '1. **Develop Project Parameters**: The course '
                       'emphasizes the importance of developing project '
                       'parameters that align with business requirements. This '
                       'includes creating a project charter, identifying '
                       'stakeholders, and developing a project management plan '
                       'that encompasses various management aspects such as '
                       'scope, schedule, cos

In [53]:
import json

llm_config={
    "config_list": [
        {
            'model': "gpt-4o",
            'api_key': OPENAI_API_KEY,
        },
    ],
    "timeout": 300,
}

user_proxy_agent = UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=5,
    is_termination_msg=lambda msg: msg.get("content", "") and "TERMINATE" in msg["content"],
    code_execution_config={"work_dir": "output", "use_docker": False}
)

# Autogen setup
qa_generation_agent = AssistantAgent(
    name="question_answer_generator",
    system_message=f"""
    You are an expert educator in '{extracted_data.course_title}'. You will create scenario-based case study question-answer pairs based on course data.
    The data will include:
    - A scenario
    - Retrieved content aligned with learning outcomes and abilities

    ### Instructions:
    1. Use the provided scenario and retrieved content to generate one question-and-answer pairs per one learning outcome.
    2. Each question should be aligned with the learning outcome and abilities implied by the retrieved content and the Bloom's Taxonomy Level.
    3. The answer should demonstrate mastery of the abilities and address the scenario context.
    4. The **answer** must be in a **case study solution style**: a detailed solution or approach addressing the scenario's key challenges.
        - Explain what to do, why it matters, and any recommended tools or methods.
        - Provide a short rationale (“why”) for each recommended action, tying it back to the scenario’s goals.

    4. Ensure all keys and values are double-quoted in the JSON string output.
    5. Return the output in JSON in String format with the following structure:
    import
        ```
        {{
            "course_title": "<course_title_here>",
            "duration": "<assessment_duration_here>",
            "scenario": "<scenario_here>",
            "questions": [
            {{
                "question_statement": "<question_text>",
                "answer": "[<list_of_answer_text>]",
                "ability_id": ["<list_of_ability_ids>"]
            }},
            ...
            ]
        }}
        ```
    """,
    llm_config=llm_config,
)
assessment_duration = ""
for assessment in extracted_data.assessments:
    if "CS" in assessment.code:
        assessment_duration = assessment.duration

cs_context = {}
chat_result = user_proxy_agent.initiate_chat(
    qa_generation_agent,
    message=f"""
    Please generate the questions and answer using the following course title:'{extracted_data.course_title}', assessment_duration:'{assessment_duration}', scenario: '{scenario}' and topic contents:{retrieved_content}
    Phrase your question in accordance with the Bloom's Taxonomy Level: {extracted_data.tsc_proficiency_level}
    Bloom's Taxonomy Level Information:
        Level 1: Remembering
        Level 2: Understanding
        Level 3: Applying
        Level 4: Analyzing
        Level 5: Evaluating
        Level 6: Creating
    Return the question and answer as a JSON in String Format containing the specified fields.
    import
    ```
    {{
        "course_title": "<course_title_here>",
        "duration": "<assessment_duration_here>",
        "scenario": "<scenario_here>",
        "questions": [
        {{
            "question_statement": "<question_text>",
            "answer": "[<list_of_answer_text>]",
            "ability_id": ["<list_of_ability_ids>"]
        }},
        ...
        ]
    }}
    ```
    RETURN 'TERMINATE' once the generation is done.
    """,
    summary_method="reflection_with_llm",
)
try:
    last_message_content = chat_result.chat_history[-1].get("content", "")
    if not last_message_content:
        print("No content found in the agent's last message.")
    last_message_content = last_message_content.strip()
    json_pattern = re.compile(r'```json\s*(\{.*?\})\s*```', re.DOTALL)
    json_match = json_pattern.search(last_message_content)
    if json_match:
        json_str = json_match.group(1)
        context = json.loads(json_str)
        cs_context = context
        pprint.pprint(context)
    pprint.pprint(f"CS cost: {chat_result.cost}")
except json.JSONDecodeError as e:
    print(f"Error parsing context JSON: {e}")



[33muser_proxy[0m (to question_answer_generator):


    Please generate the questions and answer using the following course title:'Project Management with Generative AI (GenAI)', assessment_duration:'1 hr', scenario: 'TechSolutions, a mid-sized IT consulting firm, has recently adopted Generative AI (GenAI) to enhance its project management capabilities. However, the company is struggling with communication breakdowns between project teams and clients, leading to misaligned expectations and delayed deliverables. As a result, customer satisfaction has plummeted, and project budgets are being exceeded due to rework and inefficient resource allocation. The management team recognizes the need to develop clear project parameters that align with client requirements and to optimize resource usage through GenAI tools. They also aim to identify potential risks associated with project delivery and implement effective risk mitigation strategies to ensure successful project outcomes.

In this con

In [54]:
import os
import tempfile
from docxtpl import DocxTemplate

def _ensure_list(answer):
    """
    Returns a list regardless of whether 'answer' is originally a list, string, or None.
    """
    if isinstance(answer, list):
        return answer
    elif isinstance(answer, str):
        # Wrap the string in a list so Jinja loops or docxtpl loops won't break
        return [answer]
    return []

def generate_documents(context: dict, assessment_type: str, output_dir: str) -> dict:
    """
    Generate the question paper and answer paper for the given context and type.

    Parameters:
    - context (dict): The data for the assessment (course title, type, questions, etc.).
    - assessment_type (str): The assessment type (e.g., 'Ability-based', 'Knowledge-based').
    - output_dir (str): Directory where the generated documents will be saved.

    Returns:
    - dict: Paths to the generated documents (question and answer papers).
    """

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load template paths
    TEMPLATES = {
        "QUESTION": f"C:/Users/dljh1/Documents/courseware_autogen/Assessment/Templates/(Template) {assessment_type} - Course Title - v1.docx",
        "ANSWER": f"C:/Users/dljh1/Documents/courseware_autogen/Assessment/Templates/(Template) Answer to {assessment_type} - Course Title - v1.docx"
    }

    qn_template = TEMPLATES["QUESTION"]
    ans_template = TEMPLATES["ANSWER"]

    question_doc = DocxTemplate(qn_template)
    answer_doc = DocxTemplate(ans_template)
    
    # Prepare context for the answer paper:
    # Convert each question's 'answer' to a list if it's a string or None
    answer_context = {
        **context,
        "questions": [
            {
                **question,
                "answer": _ensure_list(question.get("answer"))
            }
            for question in context.get("questions", [])
        ]
    }

    # Prepare context for the question paper by creating a copy of the context without answers
    question_context = {
        **context,
        "questions": [
            {
                **question,
                "answer": None  # Remove answers for the question doc
            }
            for question in context.get("questions", [])
        ]
    }

    # Render both templates
    # IMPORTANT FIX: Use answer_context instead of context for the answer doc
    answer_doc.render(answer_context)
    question_doc.render(question_context)

    # Create temporary files for the question and answer documents
    question_tempfile = tempfile.NamedTemporaryFile(
        delete=False, suffix=f"_{assessment_type} - {context.get('course_title')} - v1.docx"
    )
    answer_tempfile = tempfile.NamedTemporaryFile(
        delete=False, suffix=f"Answer to {assessment_type} - {context.get('course_title')} - v1.docx"
    )

    # Save the rendered documents to the temporary files
    question_doc.save(question_tempfile.name)
    answer_doc.save(answer_tempfile.name)

    return {
        "ASSESSMENT_TYPE": assessment_type,
        "QUESTION": question_tempfile.name,
        "ANSWER": answer_tempfile.name
    }


In [55]:
cs_context = {
    "course_title": "Project Management with Generative AI (GenAI)",
    "duration": "1 hr",
    "scenario": "TechSolutions, a mid-sized IT consulting firm, has recently adopted Generative AI (GenAI) to enhance its project management capabilities. However, the company is struggling with communication breakdowns between project teams and clients, leading to misaligned expectations and delayed deliverables. As a result, customer satisfaction has plummeted, and project budgets are being exceeded due to rework and inefficient resource allocation. The management team recognizes the need to develop clear project parameters that align with client requirements and to optimize resource usage through GenAI tools. They also aim to identify potential risks associated with project delivery and implement effective risk mitigation strategies to ensure successful project outcomes. In this context, the project manager is tasked with leveraging GenAI to establish precise project parameters that reflect business needs, advising team members on overcoming delivery challenges, and proactively identifying risks that could impact project timelines and budgets. By utilizing GenAI, the manager seeks to create a more collaborative environment, streamline communication, and enhance overall project efficiency, ultimately restoring client trust and satisfaction.",
    "questions": [
        {
            "question_statement": "Evaluate how TechSolutions can develop project parameters using Generative AI to ensure they align with business requirements.",
            "answer": [
                "To develop project parameters that align with the business requirements using Generative AI, TechSolutions should first ensure the creation of a comprehensive Project Charter. This foundational document serves to clearly outline the project's objectives, scope, and the business rationale, aligning with strategic goals.",
                "Next, TechSolutions should engage key stakeholders, both internal and external, to gain a holistic understanding of the business requirements. This can be achieved by utilizing GenAI tools to analyze stakeholder inputs and generate insights on expectations and constraints.",
                "Subsequently, TechSolutions should leverage Generative AI to develop a Project Management Plan encompassing scope, schedule, cost, quality, and resources. GenAI can help model different scenarios to find the optimal set of project parameters that meet business requirements while maximizing efficiency.",
                "It's important to continuously update and refine these parameters as the project progresses and new information becomes available, using GenAI's predictive analytics capabilities to anticipate changes and adjust plans proactively.",
                "Aligning project parameters with genuine business needs is crucial as it ensures that the project delivers value to the organization and meets stakeholders' expectations, thereby restoring client trust and satisfaction."
            ],
            "ability_id": ["A1"]
        },
        {
            "question_statement": "Evaluate strategies that TechSolutions can use to resolve project delivery challenges and optimize resources and budgets with Generative AI.",
            "answer": [
                "TechSolutions can address project delivery challenges by implementing Generative AI-driven strategies for optimizing resource allocation and budget management. Firstly, the firm should enhance knowledge management to ensure all team members are aware of the available resources and how to leverage them effectively. This involves creating a centralized knowledge repository with GenAI to provide insights on resource utilization and availability.",
                "Leveraging GenAI for resource optimization techniques such as resource leveling and smoothing can minimize overload and ensure that resources are efficiently allocated based on real-time demands. This is essential for maintaining productivity without exhausting the team.",
                "Cost management is another critical area, where TechSolutions can employ GenAI to produce accurate cost estimates and continuously monitor actual expenditures against the budget. This allows for early detection of potential overruns and deviations, facilitating proactive adjustments.",
                "Furthermore, by engaging stakeholders through AI-driven communication tools, TechSolutions can better understand resource needs and budget constraints, allowing for informed decision-making and alignment with project goals.",
                "Optimizing resource allocation and managing budgets efficiently is key to overcoming project delivery challenges, reducing delays, and preventing budget overruns, ultimately contributing to successful project outcomes and enhanced customer satisfaction."
            ],
            "ability_id": ["A2"]
        },
        {
            "question_statement": "Evaluate how TechSolutions can anticipate project risks and devise effective mitigation strategies using Generative AI.",
            "answer": [
                "TechSolutions can anticipate project risks and devise mitigation strategies by using Generative AI to monitor for risk trigger conditions and predict potential impacts. GenAI can parse through vast volumes of data to identify both known and unknown risk factors, providing early warning signals.",
                "The company should implement a robust risk management process starting with the identification and documentation of potential risks. Generative AI can aid in creating a comprehensive risk register by analyzing historical project data and current project dynamics.",
                "Performing qualitative risk analysis using AI can help assess the likelihood and impact of each identified risk, allowing TechSolutions to prioritize risks based on their severity. GenAI can simulate different scenarios to evaluate how varying risk conditions might affect project delivery.",
                "In terms of risk responses, TechSolutions can use GenAI to develop strategic mitigation actions such as transferring risks, exploiting opportunities, or reducing the probability of threats. AI-enabled predictive models can test different mitigation strategies to choose the most effective responses.",
                "Continuous risk monitoring is essential, and Generative AI can automate this process by tracking risk statuses and identifying new risks as projects evolve. This proactive approach ensures that TechSolutions remains agile in its risk management, thereby safeguarding project outcomes and maintaining client confidence."
            ],
            "ability_id": ["A3", "A4"]
        }
    ]
}

files = generate_documents(
    context=cs_context, 
    assessment_type="CS",
    output_dir="output"
)

print(f"Generated documents: {files}")

Generated documents: {'ASSESSMENT_TYPE': 'CS', 'QUESTION': 'C:\\Users\\dljh1\\AppData\\Local\\Temp\\tmp153jv4mg_CS - Project Management with Generative AI (GenAI) - v1.docx', 'ANSWER': 'C:\\Users\\dljh1\\AppData\\Local\\Temp\\tmpro13zz9bAnswer to CS - Project Management with Generative AI (GenAI) - v1.docx'}
