In [1]:
from copy import deepcopy
from pathlib import Path
import os
import re
from copy import deepcopy
from pathlib import Path
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_parse import LlamaParse
from llama_index.core.schema import TextNode
from typing import Optional


In [2]:
import nest_asyncio

nest_asyncio.apply()

### Defining Pydantic Models for Structured Output

In this section, we define several Pydantic models to structure the output data. These models ensure that the extracted and generated data adheres to a predefined schema, facilitating validation and consistency.

- `KnowledgeStatement`: Represents a knowledge statement with an ID and text.
- `AbilityStatement`: Represents an ability statement with an ID and text.
- `Topic`: Represents a topic with a name, subtopics, knowledge statements, and ability statements.
- `LearningUnit`: Represents a learning unit with a name, topics, and a learning outcome.
- `AssessmentMethod`: Represents an assessment method with a code and duration.
- `FacilitatorGuideExtraction`: Represents the structured data extracted from the facilitator guide, including course title, proficiency level, learning units, and assessments.
- `KnowledgeStatementContent`: Represents the content retrieved for a knowledge statement, including the knowledge ID, statement, topic name, and retrieved content.
- `WSQ`: Represents a workplace scenario question, including the knowledge ID, statement, scenario, question, and answer.
- `CaseStudyQuestion`: Represents a case study question with a question, answer, and associated abilities.
- `CaseStudy`: Represents a case study with a scenario and a list of case study questions.

These models are used to structure and validate the data throughout the extraction and generation processes, ensuring that the output is consistent and reliable.

In [85]:
from llama_index.llms.openai import OpenAI
from pydantic.v1 import BaseModel, Field
from typing import List

# Define Pydantic models for structured output
class KnowledgeStatement(BaseModel):
    id: str
    text: str


class AbilityStatement(BaseModel):
    id: str
    text: str


class Topic(BaseModel):
    name: str
    subtopics: List[str]
    tsc_knowledges: List[KnowledgeStatement]
    tsc_abilities: List[AbilityStatement]


class LearningUnit(BaseModel):
    name: str
    topics: List[Topic]
    learning_outcome: str


class AssessmentMethod(BaseModel):
    code: str
    duration: str

class FacilitatorGuideExtraction(BaseModel):
    course_title: str
    tsc_proficiency_level: str
    learning_units: List[LearningUnit]
    assessments: List[AssessmentMethod]  # New field for assessments

# Define a Pydantic model for the Knowledge Statement and its Retrieved Content
class KnowledgeStatementContent(BaseModel):
    knowledge_id: str = Field(..., description="The ID of the Knowledge Statement, e.g., K1, K2.")
    knowledge_statement: str = Field(..., description="The text of the Knowledge Statement.")
    topic_name: str = Field(..., description="The name of the topic associated with this Knowledge Statement.")
    retrieved_content: str = Field(..., description="The content retrieved for this Knowledge Statement.")

# Define the WSQ model for structured output
class WSQ(BaseModel):
    knowledge_id: str = Field(..., description="The ID of the Knowledge Statement, e.g., K1, K2.")
    knowledge_statement: str = Field(..., description="The text of the Knowledge Statement.")
    scenario: str = Field(..., description="The realistic workplace scenario.")
    question: str = Field(..., description="The question based on the scenario.")
    answer: str = Field(..., description="The concise answer to the question.")


class CaseStudyQuestion(BaseModel):
    question: str
    answer: str
    abilities: List[AbilityStatement]

class CaseStudy(BaseModel):
    scenario: str
    questions: List[CaseStudyQuestion]

## Setting Up LlamaParse, LlamaIndex and OpenAI Models

In this section, we initialize the `Settings` for LlamaIndex with the OpenAI embedding and language models. We use the `OpenAIEmbedding` model for embeddings and the `OpenAI` model for language processing.

In [3]:
OPENAI_API_KEY = os.getenv('TERTIARY_INFOTECH_API_KEY') 
LLAMA_API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

In [4]:
embed_model = OpenAIEmbedding(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
llm = OpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
parser = LlamaParse(
    result_type="markdown",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt-4o-mini",
    invalidate_cache=True,
)

In [6]:
print(f"Parsing slide deck...")
md_json_objs = parser.get_json_result(r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\input\WSQ- Learner Guide Slides - Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini - v5.pdf")
md_json_list = md_json_objs[0]["pages"]

Parsing slide deck...
Started parsing the file under job_id 5a5afb7f-8b1c-489a-94c5-893a3d2e3c6d


In [7]:
print(md_json_list[10]["md"])

# Final Assessment
- Written Assessment (SAQ) - 1 hr
- Practical Performance (PP) - 1 hr


In [8]:
print(md_json_list[1].keys())

dict_keys(['page', 'md', 'images', 'charts', 'items', 'status', 'links', 'triggeredAutoMode', 'structuredData', 'noStructuredContent', 'noTextContent'])


In [46]:
print(md_json_list[0]['images'])

[{'name': 'page_1.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]


In [12]:
image_dicts = parser.get_images(md_json_objs, download_path="data_images")

> Image for page 1: [{'name': 'page_1.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 2: [{'name': 'page_2.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 3: [{'name': 'page_3.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 4: [{'name': 'page_4.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 5: [{'name': 'page_5.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 6: [{'name': 'page_6.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 7: [{'name': 'page_7.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 8: [{'name': 'page_8.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 9: [{'name': 'page_9.jpg', 'height': 0,

In [9]:
from llama_index.core.schema import TextNode
from typing import Optional

In [10]:
# get pages loaded through llamaparse
import re


def get_page_number(file_name):
    match = re.search(r"-page-(\d+)\.jpg$", str(file_name))
    if match:
        return int(match.group(1))
    return 0


def _get_sorted_image_files(image_dir):
    """Get image files sorted by page."""
    raw_files = [f for f in list(Path(image_dir).iterdir()) if f.is_file()]
    sorted_files = sorted(raw_files, key=get_page_number)
    return sorted_files

In [11]:
from copy import deepcopy
from pathlib import Path


# attach image metadata to the text nodes
def get_text_nodes(json_dicts, image_dir=None):
    """Split docs into nodes, by separator."""
    nodes = []

    image_files = _get_sorted_image_files(image_dir) if image_dir is not None else None
    md_texts = [d["md"] for d in json_dicts]

    for idx, md_text in enumerate(md_texts):
        chunk_metadata = {"page_num": idx + 1}
        if image_files is not None:
            image_file = image_files[idx]
            chunk_metadata["image_path"] = str(image_file)
        chunk_metadata["parsed_text_markdown"] = md_text
        node = TextNode(
            text="",
            metadata=chunk_metadata,
        )
        nodes.append(node)

    return nodes

In [12]:
# this will split into pages
text_nodes = get_text_nodes(md_json_list, image_dir="data_images")

In [13]:
print(text_nodes[10].get_content(metadata_mode="all"))

page_num: 11
image_path: data_images\ca20a00e-bea0-4d03-b623-d3591f78f92d-page_108.jpg
parsed_text_markdown: # Final Assessment
- Written Assessment (SAQ) - 1 hr
- Practical Performance (PP) - 1 hr


In [14]:
import os
from llama_index.core import (
    StorageContext,
    SummaryIndex,
    load_index_from_storage,
)

if not os.path.exists("storage_nodes_summary"):
    index = SummaryIndex(text_nodes)
    # save index to disk
    index.set_index_id("summary_index")
    index.storage_context.persist("./storage_nodes_summary")
else:
    # rebuild storage context
    storage_context = StorageContext.from_defaults(persist_dir="storage_nodes_summary")
    # load index
    index = load_index_from_storage(storage_context, index_id="summary_index")

### Extraction of FG Document Data

In this section, we focus on extracting structured data from the Facilitator Guide (FG) document. The process involves several key steps:

1. **Parsing the FG Document**: We utilize the `LlamaParse` tool to preprocess the content of the FG document. This involves reading the document and converting it into a structured format that can be easily processed.

2. **Loading Document Content**: The content of the FG document is loaded using the `Document` class from the `docx` module. This allows us to read the text content of the document and prepare it for further processing.

3. **Extracting Structured Data**: Using the OpenAI model, we extract relevant information from the preprocessed content. This includes details such as the course title, TSC proficiency level, learning units, topics, knowledge statements, ability statements, and assessment methods. The extracted data is validated against the `FacilitatorGuideExtraction` Pydantic model to ensure it adheres to the expected schema.

In [None]:
from pydantic import BaseModel
from openai import OpenAI
from docx import Document
from typing import Dict, List
from llama_parse import LlamaParse
import nest_asyncio
import os
import json

# API access to llama-cloud
LLAMA_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Using OpenAI API for embeddings/llms
OPENAI_API_KEY = os.getenv("TERTIARY_INFOTECH_API_KEY")

# Initialize the OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# Function to load FG document content using LlamaParse
def parse_with_llamaparse(file_path: str) -> str:
    """
    Parses the FG document using LlamaParse to preprocess its content.

    Args:
        file_path (str): Path to the document.

    Returns:
        str: Preprocessed content of the document as text.
    """

    parser = LlamaParse(
        api_key=LLAMA_API_KEY,
        result_type="markdown",
        show_progress=True,
        verbose=True,
        num_workers=8
    )

    result = parser.get_json_result(file_path)
    return result

# Function to load FG document content
def load_fg_document(file_path: str) -> str:
    """
    Reads the content of a Word document.

    Args:
        file_path (str): Path to the document.

    Returns:
        str: The document's text content.
    """
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

# Function to extract structured data from FG document
def extract_fg_data(preprocessed_content: str) -> FacilitatorGuideExtraction:
    """
    Extracts relevant information from the FG document using GPT and Pydantic.

    Args:
        preprocessed_content (str): Preprocessed text content of the FG document.

    Returns:
        FacilitatorGuideExtraction: Extracted and validated data.
    """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    """You are an expert at structured data extraction. Extract the following details from the FG Document:
                    - Course Title
                    - TSC Proficiency Level
                    - Learning Units (LUs):
                        * Name of the Learning Unit
                        * Topics in the Learning Unit:
                            - Name of the Topic
                            - Description of the Topic (bullet points or sub-topics)
                            - Full Knowledge Statements associated with the topic, including their identifiers and text (e.g., K1: Range of AI applications)
                            - Full Ability Statements associated with the topic, including their identifiers and text (e.g., A1: Analyze algorithms in the AI applications)
                        * Learning Outcome (LO) for each Learning Unit
                    - Assessment Types and Durations:
                        * Extract assessment types and their durations in the format:
                          {"code": "WA-SAQ", "duration": "1 hr"}
                          {"code": "PP", "duration": "0.5 hr"}
                          {"code": "CS", "duration": "30 mins"}
                        * Interpret abbreviations of assessment methods to their correct types (e.g., "WA-SAQ," "PP," "CS").
                        * Include total durations if mentioned.

                    Return the output in a JSON format that matches the schema provided:
                    {
                        "course_title": "string",
                        "tsc_proficiency_level": "string",
                        "learning_units": [
                            {
                                "name": "string",
                                "topics": [
                                    {
                                        "name": "string",
                                        "subtopics": ["string"],
                                        "tsc_knowledges": [
                                            {"id": "string", "text": "string"}
                                        ],
                                        "tsc_abilities": [
                                            {"id": "string", "text": "string"}
                                        ]
                                    }
                                ],
                                "learning_outcome": "string"
                            }
                        ],
                        "assessments": [
                            {"code": "string", "duration": "string"}
                        ]
                    }
                    """
                ),
            },
            {"role": "user", "content": preprocessed_content},
        ],
        response_format=FacilitatorGuideExtraction,
    )

    return completion.choices[0].message.parsed

# Main Execution
if __name__ == "__main__":
    # Path to the FG document
    file_path = r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\input\FG_TGS-2024042961_Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini_v2.docx"
    
    # Parse document content using LlamaParse
    try:
        parsed_content = parse_with_llamaparse(file_path)
        print("Preprocessed Content:")
        print(json.dumps(parsed_content, indent=4))
    except Exception as e:
        print(f"Error during LlamaParse parsing: {e}")
        exit(1)

    # Extract and validate structured data using OpenAI and Pydantic
    try:
        extracted_data = extract_fg_data(json.dumps(parsed_content))
        print("Structured Data:")
        print(extracted_data)
    except Exception as e:
        print(f"Error during OpenAI extraction: {e}")


Started parsing the file under job_id 70f99f31-3cfe-48e7-911d-d057469f6265
Preprocessed Content:
[
    {
        "pages": [
            {
                "page": 1,
                "text": "                                    T\n                              Facilitator Guide\n                                         For\n\nDevelop Artificial Intelligence and Large Language Model (LLM) Applications\n                                with Google Gemini\n\n                           TGS Ref No: TGS-2024042961\n\n                                   Conducted by\n                         TERTIARY INFOTECH PTE. LTD\n\n                                 UEN: 201200696W\n\n                                     Version 2.0",
                "md": "# Facilitator Guide\n\n# For\n\n# Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini\n\nTGS Ref No: TGS-2024042961\n\n# Conducted by\n\nTERTIARY INFOTECH PTE. LTD\n\nUEN: 201200696W\n\nVersion 2.0",
             

In [34]:
print(extracted_data)

course_title='Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini' tsc_proficiency_level='3' learning_units=[LearningUnit(name='Overview of Large Language Model (LLM)', topics=[Topic(name='Overview of Large Language Model (LLM)', subtopics=['What is Large Language Model (LLM)?', 'Opportunities LLM applications', 'Industrial use cases of LLM applications'], tsc_knowledges=[KnowledgeStatement(id='K1', text='Range of AI applications'), KnowledgeStatement(id='K6', text='Applicability of AI in the industry')], tsc_abilities=[AbilityStatement(id='A1', text='Analyse algorithms in the AI applications'), AbilityStatement(id='A3', text='Identify strengths and limitations of the AI applications')])], learning_outcome='Analyze the range of LLM applications using Generative AI (GAI) and identify their industrial use cases.'), LearningUnit(name='Multimodal Prompting with Google Gemini LLM', topics=[Topic(name='Multimodal Prompting with Google Gemini LLM', s

## Generating Case Study Scenario-based Questions (CS Scenario-based Questions)

In this section, we generate the Case Study assessment questions and answers. Each question is linked to a specific learning outcome and includes a question, an answer, and its associated ability statements, which are structured in a JSON format. The results are then saved to a file for further use.

### Retrieve the Learning Outcomes and its associated Ability Statements

In [36]:
def get_learning_outcomes_with_abilities(data: FacilitatorGuideExtraction):
    """
    Retrieves learning outcomes and their associated ability statements from the extracted data.

    Args:
        data (dict): The extracted data.

    Returns:
        List of dictionaries with learning outcomes and associated abilities.
    """
    result = []
    for unit in data.learning_units:
        learning_outcome = unit.learning_outcome
        abilities = []
        for topic in unit.topics:
            abilities.extend(topic.tsc_abilities)
        result.append({
            "learning_outcome": learning_outcome,
            "abilities": abilities
        })
    return result

mapped_data = get_learning_outcomes_with_abilities(extracted_data)

for entry in mapped_data:
    print(f"Learning Outcome: {entry['learning_outcome']}")
    print("Associated Abilities:")
    for ability in entry["abilities"]:
        print(f"- {ability.id}: {ability.text}")
    print()


Learning Outcome: Analyze the range of LLM applications using Generative AI (GAI) and identify their industrial use cases.
Associated Abilities:
- A1: Analyse algorithms in the AI applications
- A3: Identify strengths and limitations of the AI applications

Learning Outcome: Establish Google Gemini GAI designs and assess improvements on engineering processes.
Associated Abilities:
- A2: Establish the correlation between design of algorithms and efficiency
- A6: Assess improvements on the engineering and maintenance processes

Learning Outcome: Develop LLM applications and assess its feasibility.
Associated Abilities:
- A5: Assess feasibility of AI applications to the engineering processes

Learning Outcome: Evaluate the performance effectiveness of Retrieval Augmented Generation (RAG).
Associated Abilities:
- A4: Evaluate various AI applications to compare strengths and limitations of the AI applications



### Scenario Generation Agent
The Scenario Generation Agent is designed to create detailed, realistic scenarios based on specific educational inputs. By leveraging the provided Learning Outcomes, Course Title, and TSC Proficiency Level, the agent generates a comprehensive scenario that aligns with the educational goals and proficiency requirements. This scenario serves as a practical context for learners to apply their knowledge and skills, ensuring that the learning experience is both relevant and engaging.

The agent uses advanced language models to craft scenarios that are approximately 500 words in length, providing sufficient detail to cover the complexities and nuances of real-world situations. These scenarios are tailored to highlight specific organizational challenges, data points, and objectives, making them highly applicable to the learners' future professional environments.

Key features of the Scenario Generation Agent include:
- **Alignment with Learning Outcomes**: Ensures that the generated scenario directly supports the specified learning outcomes, helping learners achieve the desired educational objectives.
- **Relevance to Course Title**: Incorporates elements related to the course title, ensuring that the scenario is contextually appropriate and enhances the overall learning experience.
- **TSC Proficiency Level**: Adjusts the complexity and depth of the scenario based on the specified TSC proficiency level, catering to the learners' current skill set and knowledge base.
- **Realistic and Practical**: Focuses on creating scenarios that are realistic and practical, providing learners with opportunities to apply their skills in situations that mirror real-world challenges.
- **Detailed and Comprehensive**: Generates scenarios that are 1-2 paragraphs long, offering a thorough exploration of the context and challenges, and providing ample material for learners to engage with.

By integrating these features, the Scenario Generation Agent ensures that learners are well-prepared to tackle real-world problems, enhancing their ability to apply theoretical knowledge in practical settings.

In [None]:
# System prompt tailored for content retrieval and question-answer generation
system_prompt = """\
You are an instructional design assistant tasked with generating concise, realistic, and practical scenario-based question-answer pairs for educational purposes.

Your role:
1. **Generate a real-world scenario** for the given Course Title and Learning Outcome (LO). The scenario must:
   - Be concise (1-2 paragraphs) while clearly describing the organizational challenges or context.
   - Align directly with the Learning Outcome and be applicable to the associated abilities.
   - Highlight specific organizational data, challenges, and objectives to ensure relevance and practicality.

2. Use only the information relevant to the specified Learning Unit, Learning Outcome, and its abilities. Do not include information from unrelated topics.

3. Ensure that:
   - Each scenario and question-answer pair is realistic, aligned to Bloom's Taxonomy level for the LO, and practically applicable.
   - If no relevant content exists, create a general scenario that remains educationally valuable and tied to the broader course theme.

**Output Requirements:**
- The scenario should be 1-2 paragraphs, free of markdown elements or formatting.

**Restrictions:**
- Do not include content from other topics or unrelated slides.
- Do not invent abilities or knowledge outside the scope of the LO and its associated abilities.
"""

scenario_llm = OpenAI(model="gpt-4o-mini", system_prompt=system_prompt)

### Question and Answer Generation Agent

The Question and Answer Generation Agent is designed to facilitate the creation of educational content by generating scenario-based questions and answers. This agent operates by taking in a specified Learning Outcome (LO) and its associated abilities, retrieving relevant content, and then generating questions and answers that align with the given scenario and Bloom's Taxonomy level.

#### Key Features:
1. **Input Handling**:
    - **Learning Outcome (LO)**: The specific educational goal that the learners are expected to achieve.
    - **Associated Abilities**: The skills or competencies that are linked to the Learning Outcome.

2. **Content Retrieval**:
    - The agent retrieves content that is relevant to the specified Learning Outcome and its associated abilities. This ensures that the generated questions and answers are contextually appropriate and aligned with the educational objectives.

3. **Scenario Alignment**:
    - The agent generates a realistic and practical scenario that aligns with the Learning Outcome. This scenario provides a context for the questions and answers, making them more engaging and applicable to real-world situations.

4. **Question and Answer Generation**:
    - Based on the retrieved content and the generated scenario, the agent formulates questions that require learners to demonstrate their understanding and mastery of the Learning Outcome and associated abilities.
    - The answers are crafted to align with Bloom's Taxonomy level, ensuring that they meet the desired cognitive complexity and educational standards.

5. **Output**:
    - The final output includes the scenario, the question, and the answer, all structured in a clear and concise format. This output can be used directly in educational assessments or as part of instructional materials.

By integrating these features, the Question and Answer Generation Agent ensures that the generated educational content is both relevant and effective in achieving the specified learning objectives.

In [None]:
# System prompt tailored for content retrieval and question-answer generation
system_prompt = """\
You are a content retrieval assistant tasked with generating a scenario-based question-answer pair for educational purposes.

Your role:
1. Generate a common **real-world scenario** for the given Learning Unit (LU) and its Learning Outcome (LO). The scenario should align with the Learning Outcome and be applicable to the associated abilities.
2. Based on the scenario:
   - Generate a question that requires learners to demonstrate the abilities associated with the LO.
   - Provide a detailed answer to the question that aligns with the Learning Outcome and demonstrates mastery of the abilities.
3. Use only the information relevant to the specified Learning Unit, Learning Outcome, and its abilities.
4. Identify and extract the exact inline segments from the provided documents that directly correspond to the content used to generate the question and answer.

Ensure that:
- Each segment is an exact match to a part of the document.
- The relevance of each segment to the question and answer is clear.
- If no relevant information is found for the LO or abilities, generate a general scenario and related question-answer pair.

Restrictions:
- Do not include content from other topics or unrelated slides.
- Do not invent abilities or knowledge outside the scope of the LO and its associated abilities."""

llm = OpenAI(model="gpt-4o-mini", system_prompt=system_prompt)

In [80]:
query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=scenario_llm,
    response_mode="compact",
)

In [81]:
tsc_proficiency = extracted_data.tsc_proficiency_level
print(tsc_proficiency)

3


In [None]:
from typing import List
from pydantic.v1 import BaseModel, Field

# Generate a detailed scenario for the case study
def generate_case_study_scenario(course_title: str, bloom_taxonomy_level: str, learning_outcomes: List[str]) -> str:
    """
    Generates a concise, realistic scenario for the case study.
    Args:
        course_title (str): The title of the course.
        learning_outcomes (List[str]): A list of learning outcomes.

    Returns:
        str: A concise scenario for the case study.
    """
    outcomes_text = "\n".join([f"- {lo}" for lo in learning_outcomes])
    prompt = (
        f"You are tasked with designing a concise, realistic scenario for the course '{course_title}'. "
        f"The scenario should align with these learning outcomes:\n{outcomes_text}\n and the Bloom's Taxonomy Level: \n{bloom_taxonomy_level}\n. "
        "The scenario should describe a company or organization facing challenges related to communication, collaboration, or customer satisfaction. "
        "Ensure the scenario is realistic and practical, and keep it to 1-2 paragraphs without markdown elements or formatting."
    )
    response = query_engine.query(prompt)
    return response.response.strip()

# Inputs
course_title = "Mastering the Art of Communication to Enhance Team Collaboration and Customer Satisfaction"

# Dynamic list of learning outcomes
learning_outcomes = [
    {
        "id": "LO1",
        "text": "Conduct research and evaluate best practices in workplace communications with suitable communication tools and methods.",
        "ability": {"id": "A1", "text": "Conduct research on best practices in workplace communications, evaluate their suitability for adoption and establish benchmarks for the organisation"}
    },
    {
        "id": "LO2",
        "text": "Evaluate internal and external trends in human capital statistics, organisational culture and the impact on employee engagement.",
        "ability": {"id": "A2", "text": "Evaluate internal and external trends and human capital statistics that may have an impact on employee engagement"}
    },
    {
        "id": "LO3",
        "text": "Develop a communication plan incorporating strategies and coaching methods for effective staff communication.",
        "ability": {"id": "A3", "text": "Develop communications plan to implement communication strategies and mechanisms"}
    }
]

# Extract just the learning outcome text for scenario generation
learning_outcome_texts = [outcome["text"] for outcome in learning_outcomes]

# Generate the shared scenario
scenario = generate_case_study_scenario(course_title, tsc_proficiency,learning_outcome_texts)

In [76]:
print(scenario)

A mid-sized technology firm, Tech Innovations Inc., has been experiencing a decline in employee engagement and customer satisfaction over the past year. The company has recently undergone significant changes, including a shift to remote work and the introduction of new project management software. Employees have reported feeling disconnected from their teams, leading to misunderstandings and delays in project timelines. Additionally, customer feedback indicates that communication regarding product updates and support has been inconsistent, resulting in frustration among clients. The management team recognizes the need to address these communication challenges to foster a more collaborative work environment and improve customer relations.

To tackle these issues, the leadership at Tech Innovations Inc. has decided to conduct thorough research into best practices for workplace communication. They aim to evaluate various communication tools and methods that can enhance both internal colla

In [83]:
display(Markdown(scenario))

In a mid-sized tech company, Innovatech Solutions, the management has noticed a significant decline in employee engagement and customer satisfaction over the past year. Internal surveys reveal that employees feel disconnected from their teams and unclear about their roles, leading to a lack of collaboration on projects. Additionally, customer feedback indicates that communication regarding product updates and support has been inconsistent, resulting in frustration and lost business opportunities. The leadership team recognizes that these issues stem from ineffective communication practices and a lack of a cohesive organizational culture. They decide to address these challenges by conducting thorough research on best practices in workplace communication, evaluating the current tools and methods in use, and identifying gaps in their approach.

To tackle these issues, the management forms a task force to analyze internal and external trends in human capital statistics, focusing on how these trends affect employee engagement and overall organizational culture. They aim to understand the impact of remote work on team dynamics and explore how communication tools can be leveraged to foster a more inclusive and collaborative environment. The task force will also develop a comprehensive communication plan that incorporates strategies for effective staff communication, including coaching methods to enhance interpersonal skills among employees. By implementing these changes, Innovatech Solutions hopes to create a more engaged workforce and improve customer satisfaction, ultimately driving better business outcomes.

In [None]:
# Inputs
course_title = "Mastering the Art of Communication to Enhance Team Collaboration and Customer Satisfaction"

# Dynamic list of learning outcomes
learning_outcomes = [
    {
        "id": "LO1",
        "text": "Conduct research and evaluate best practices in workplace communications with suitable communication tools and methods.",
        "ability": {"id": "A1", "text": "Conduct research on best practices in workplace communications, evaluate their suitability for adoption and establish benchmarks for the organisation"}
    },
    {
        "id": "LO2",
        "text": "Evaluate internal and external trends in human capital statistics, organisational culture and the impact on employee engagement.",
        "ability": {"id": "A2", "text": "Evaluate internal and external trends and human capital statistics that may have an impact on employee engagement"}
    },
    {
        "id": "LO3",
        "text": "Develop a communication plan incorporating strategies and coaching methods for effective staff communication.",
        "ability": {"id": "A3", "text": "Develop communications plan to implement communication strategies and mechanisms"}
    }
]

# Extract just the learning outcome text for scenario generation
learning_outcome_texts = [outcome["text"] for outcome in learning_outcomes]

# Generate the shared scenario
scenario = generate_case_study_scenario(course_title, learning_outcome_texts)

# Generate case study questions
questions = []

for outcome in learning_outcomes:
    lo_text = outcome["text"]
    ability = outcome["ability"]
    print(f"\n--- Generating question for Learning Outcome: {lo_text}")

    response = query_engine.query(
        f"Using the shared scenario, generate a practical case study question and answer aligned with the following:\n"
        f"- Learning Outcome: {lo_text}\n"
        f"- Ability: {ability['id']} - {ability['text']}\n"
        f"Ensure the question aligns with Bloom's Level: {tsc_proficiency}. Provide the question, the detailed task description, and the answer."
    )
    
    retrieved_content = response.response
    
    # Parse the response into question and answer
    try:
        question, answer = retrieved_content.split("\n\n", 1)
        cs_question = CaseStudyQuestion(
            question=question.strip(),
            answer=answer.strip(),
            abilities=[ability]
        )
        questions.append(cs_question)
    except Exception as e:
        print(f"Error parsing question-answer pair for ability {ability['id']}: {e}")

# Combine the scenario and questions into a case study
case_study = CaseStudy(
    scenario=scenario,
    questions=questions
)

# Print structured data for verification
print(case_study.json(indent=4))

## Generating Short Answer Questions (SAQ)

In this section, we generate SAQs for each Knowledge Statement using OpenAI. The generated questions include a scenario, a question, and an answer, which are structured in a JSON format. The process involves querying the OpenAI model with the relevant content retrieved for each Knowledge Statement. The results are then saved to a file for further use.

In [None]:
from llama_index.llms.openai import OpenAI
from typing import List

# System prompt tailored for content retrieval
system_prompt = """\
You are a content retrieval assistant tasked with retrieving educational topic content aligned with a given Knowledge Statement.

Your role:
1. Restrict your retrieval strictly to the specified topic provided in the query.
2. Retrieve and summarize the topic content that aligns with the provided Knowledge Statement.
3. If no specific content directly aligns with the Knowledge Statement, provide a general summary of the specified topic instead.
4. Identify and extract the exact inline segments from the provided documents that directly correspond to the content used to 
generate the given answer. The extracted segments must be verbatim snippets from the documents, ensuring a word-for-word match with the text 
in the provided documents.

Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't used the specific document don't mention it.
- If no relevant information is found for the Knowledge Statement, clearly state this and provide a general topic summary instead.

Restrictions:
- Do not include content from other topics or slides outside the specified topic.
- Each retrieved segment must explicitly belong to the given topic.
"""

llm = OpenAI(model="gpt-4o-mini", system_prompt=system_prompt)
# sllm = llm.as_structured_llm(output_cls=ReportOutput)

In [18]:
query_engine = index.as_query_engine(
    similarity_top_k=10,
    llm=llm,
    # response_mode="tree_summarize"
    response_mode="compact",
)

In [20]:
# Initialize a list to store structured outputs
retrieved_data: List[KnowledgeStatementContent] = []

# Iterate through all learning units and topics to query for all Knowledge Statements
for learning_unit in extracted_data.learning_units:
    for topic in learning_unit.topics:
        for knowledge in topic.tsc_knowledges:
            knowledge_id = knowledge.id
            knowledge_statement = knowledge.text
            topic_name = topic.name

            print(f"\n--- Retrieving for Knowledge Statement: {knowledge_id}: {knowledge_statement}")
            
            # Query the index to retrieve topic content for this Knowledge Statement
            response = query_engine.query(
                f"Generate questions and answer content for the following topic: '{topic.name}' aligning to the Knowledge Statement: '{knowledge_statement}'."
            )
            retrieved_content = response.response

            # Add the structured data using Pydantic model
            try:
                knowledge_data = KnowledgeStatementContent(
                    knowledge_id=knowledge_id,
                    knowledge_statement=knowledge_statement,
                    topic_name=topic_name,
                    retrieved_content=retrieved_content
                )
                retrieved_data.append(knowledge_data)
            except Exception as e:
                print(f"Error adding structured data for {knowledge_id}: {e}")

# Print structured data for verification
for entry in retrieved_data:
    print(entry.json(indent=4))


--- Retrieving for Knowledge Statement: K1: Range of AI applications

--- Retrieving for Knowledge Statement: K6: Applicability of AI in the industry

--- Retrieving for Knowledge Statement: K4: Algorithm design and implementation

--- Retrieving for Knowledge Statement: K5: Methods of evaluating process improvements to the engineering processes using AI

--- Retrieving for Knowledge Statement: K3: Methods of evaluating effectiveness of AI applications

--- Retrieving for Knowledge Statement: K2: Concepts pertaining to performance effectiveness and analysis
{
    "knowledge_id": "K1",
    "knowledge_statement": "Range of AI applications",
    "topic_name": "Overview of Large Language Model (LLM)",
    "retrieved_content": "### Questions and Answers for 'Overview of Large Language Model (LLM)' Aligned to the Knowledge Statement: 'Range of AI Applications'\n\n**Question 1:** What is a Large Language Model (LLM)?\n\n**Answer:** A Large Language Model (LLM) is a type of artificial intelli

In [None]:
from pydantic import BaseModel, Field
from pydantic import BaseModel
from openai import OpenAI
from docx import Document
from typing import Dict, List
from llama_parse import LlamaParse
import nest_asyncio
import os
import json
OPENAI_API_KEY = os.getenv("TERTIARY_INFOTECH_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

# Function to generate WSQ using OpenAI
def generate_wsq_openai(knowledge_id: str, knowledge_statement: str, topic_name: str, retrieved_content: str) -> WSQ:
    """
    Generate a Workplace Scenario Question (WSQ) using OpenAI.

    Args:
        knowledge_id (str): The ID of the Knowledge Statement.
        knowledge_statement (str): The Knowledge Statement.
        topic_name (str): The name of the topic.
        retrieved_content (str): The content retrieved for the Knowledge Statement.

    Returns:
        WSQ: A structured WSQ object.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an experienced instructional designer tasked with creating Workplace Scenario Questions (WSQs). Your output must include a Knowledge Statement, Scenario, Question, and Answer in JSON format."
        },
        {
            "role": "user",
            "content": (
                f"Generate a WSQ for the following inputs:\n\n"
                f"Knowledge ID: {knowledge_id}\n"
                f"Knowledge Statement: {knowledge_statement}\n"
                f"Topic Name: {topic_name}\n"
                f"Retrieved Content: {retrieved_content}\n\n"
                f"Ensure the response adheres to this schema: {{'knowledge_id': 'string', 'knowledge_statement': 'string', 'scenario': 'string', 'question': 'string', 'answer': 'string'}}."
            )
        }
    ]

    # Call OpenAI's structured response API
    response = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=messages,
        response_format=WSQ
    )

    try:
        wsq_data = response.choices[0].message.parsed
        return wsq_data
    except Exception as e:
        print(f"Error generating WSQ: {e}")
        return None


In [22]:
# Generate WSQs for each Knowledge Statement in retrieved_data
wsq_results = []

for entry in retrieved_data:
    # Access attributes of the Pydantic model
    knowledge_id = entry.knowledge_id
    knowledge_statement = entry.knowledge_statement
    topic_name = entry.topic_name
    retrieved_content = entry.retrieved_content

    # Generate WSQ
    wsq = generate_wsq_openai(knowledge_id, knowledge_statement, topic_name, retrieved_content)
    if wsq:
        print(f"\n--- WSQ for {knowledge_id}: {knowledge_statement} ---\n")
        print(f"Scenario: {wsq.scenario}\n")
        print(f"Question: {wsq.question}\n")
        print(f"Answer: {wsq.answer}\n")
        wsq_results.append(wsq)

# Save WSQs to a file for further use
with open("wsq_results.json", "w") as f:
    json.dump([wsq.dict() for wsq in wsq_results], f, indent=4)


--- WSQ for K1: Range of AI applications ---

Scenario: As a business analyst, you are tasked with presenting a report on the potential applications of Large Language Models (LLMs) to your team. You have identified several areas where LLMs can be beneficial, such as customer support, content creation, and educational tools.

Question: What are some common applications of Large Language Models (LLMs) that you could include in your report to the team?

Answer: Common applications of LLMs include content creation and assistance, customer support and chatbots, language translation and localization, educational tools, business intelligence and analytics, accessibility for disabled persons, coding and development, legal and compliance assistance, healthcare support, art and design inspiration, enhanced search engines, and crisis management and response.


--- WSQ for K6: Applicability of AI in the industry ---

Scenario: A manager at a healthcare organization is exploring new technologies t

In [23]:
print(wsq_results)

[WSQ(knowledge_id='K1', knowledge_statement='Range of AI applications', scenario='As a business analyst, you are tasked with presenting a report on the potential applications of Large Language Models (LLMs) to your team. You have identified several areas where LLMs can be beneficial, such as customer support, content creation, and educational tools.', question='What are some common applications of Large Language Models (LLMs) that you could include in your report to the team?', answer='Common applications of LLMs include content creation and assistance, customer support and chatbots, language translation and localization, educational tools, business intelligence and analytics, accessibility for disabled persons, coding and development, legal and compliance assistance, healthcare support, art and design inspiration, enhanced search engines, and crisis management and response.'), WSQ(knowledge_id='K6', knowledge_statement='Applicability of AI in the industry', scenario='A manager at a he

In [28]:
for wsq in wsq_results:
    print(wsq.dict())

{'knowledge_id': 'K1', 'knowledge_statement': 'Range of AI applications', 'scenario': 'As a business analyst, you are tasked with presenting a report on the potential applications of Large Language Models (LLMs) to your team. You have identified several areas where LLMs can be beneficial, such as customer support, content creation, and educational tools.', 'question': 'What are some common applications of Large Language Models (LLMs) that you could include in your report to the team?', 'answer': 'Common applications of LLMs include content creation and assistance, customer support and chatbots, language translation and localization, educational tools, business intelligence and analytics, accessibility for disabled persons, coding and development, legal and compliance assistance, healthcare support, art and design inspiration, enhanced search engines, and crisis management and response.'}
{'knowledge_id': 'K6', 'knowledge_statement': 'Applicability of AI in the industry', 'scenario': 'A

In [None]:
from docxtpl import DocxTemplate
import os

def generate_documents(context: dict, type: int, output_dir: str) -> dict:
    """
    Generate the question paper and answer paper for the given context and type.

    Parameters:
    - context (dict): The data for the assessment (course title, type, questions, etc.).
    - type (int): The assessment type (1 for Ability-based, 2 for Knowledge-based).
    - output_dir (str): Directory where the generated documents will be saved.

    Returns:
    - dict: Paths to the generated documents (question and answer papers).
    """
    # Define template paths
    TEMPLATES = {
        1: { # PP
            "ANSWER": r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\Templates\(Template) Answer to A Assessment - Course Title - v1.docx",
            "QUESTION": r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\Templates\(Template) A Assessment - Course Title - v1.docx"
        },
        2: { # SAQ
            "ANSWER": r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\Templates\(Template) Answer to WA (SAQ) - Course Title - v1.docx",
            "QUESTION": r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\Templates\(Template) WA (SAQ) - Course Title - v1.docx"
        },
        3: { # CS
            "ANSWER": r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\Templates\(Template) Answers to CS - Course Title - v1.docx",
            "QUESTION": r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\Templates\(Template) CS - Course Title - v1.docx"
        },
        # 4: { # OQ
        #     "ANSWER": None,
        #     "QUESTION": r"C:\Users\dljh1\Documents\courseware_autogen\Assessment\Templates\(Template) WA (SAQ) - Course Title - v1.docx"
        # },
    }

    # Validate type
    if type not in TEMPLATES:
        raise ValueError("Invalid type. Must be 1 (Ability-based) or 2 (Knowledge-based).")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Load templates
    answer_template_path = TEMPLATES[type]["ANSWER"]
    question_template_path = TEMPLATES[type]["QUESTION"]
    answer_doc = DocxTemplate(answer_template_path)
    question_doc = DocxTemplate(question_template_path)

    # Prepare context for the question paper by creating a copy of the context without answers
    question_context = {
        **context,
        "questions": [
            {
                **question,
                "answer": None,  # Remove answers for the question document
            }
            for question in context.get("questions", [])
        ]
    }

    # Render both templates
    answer_doc.render(context)  # Render with answers
    question_doc.render(question_context)  # Render without answers

    # Save the documents to the output directory
    files = {
        "ANSWER": os.path.join(output_dir, f"Answers to WA(SAQ) - {context['course_title']} - v1.docx"),
        "QUESTION": os.path.join(output_dir, f"WA(SAQ) - {context['course_title']} - v1.docx")
    }
    answer_doc.save(files["ANSWER"])
    question_doc.save(files["QUESTION"])

    return files  # Return paths to the generated documents

In [None]:
for assessment in extracted_data.assessments:
    if 'SAQ' in assessment.code:
        context = {
            "course_title": extracted_data.course_title,  # e.g., extracted_data.course_title
            "duration": assessment.duration,
            # The scenario field at the top level can be a general overview or introduction.
            # Since each WSQ also has its own scenario, you can choose to put a general scenario here:
            "scenario": "Below are scenario-based questions derived from the course content and Knowledge Statements.",
            "questions": []
        }

        # Populate the questions from wsq_results
        for wsq in wsq_results:
            # Append the WSQ details along with the knowledge ID to the context
            context["questions"].append({
                "knowledge_id": wsq.knowledge_id,
                "scenario": wsq.scenario,
                "question_statement": wsq.question,
                "answer": wsq.answer
            })

        print("Context prepared for document generation.")

        files = generate_documents(context, type=2, output_dir="output")  # 'type=2' as per your code for knowledge-based assessment
        print("Documents generated:", files)

Context prepared for document generation.
Documents generated: {'ANSWER': 'output\\Answers to WA(SAQ) - Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini - v1.docx', 'QUESTION': 'output\\WA(SAQ) - Develop Artificial Intelligence and Large Language Model (LLM) Applications with Google Gemini - v1.docx'}
