In [1]:
!pip install langchain  langchain-community pypdf pymupdf langchain_google_genai unstructured python-pptx crewai faiss-cpu

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting pypdf
  Downloading pypdf-5.7.0-py3-none-any.whl.metadata (7.2 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.7-py3-none-any.whl.metadata (7.0 kB)
Collecting unstructured
  Downloading unstructured-0.18.5-py3-none-any.whl.metadata (24 kB)
Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting crewai
  Downloading crewai-0.141.0-py3-none-any.whl.metadata (35 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic

In [11]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.4.2-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.2-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.0 MB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m1.6/2.0 MB[0m [31m22.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m25.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.2


In [1]:
# Mount Google Drive at /content/drive (Colab only).

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from crewai import Agent, LLM
import os
from dotenv import load_dotenv
load_dotenv()

# The Gemini key is read from the environment (load_dotenv() above also picks it up from
# a local .env file) so the secret never lives in the notebook itself. Any key that was
# previously embedded here in plain text must be treated as leaked and revoked.
_gemini_api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Gemini API key not found: set GEMINI_API_KEY (or GOOGLE_API_KEY) in the "
        "environment or in a .env file before running this cell."
    )

# Shared LLM handle used by every agent defined in this notebook.
llm = LLM(
    model="gemini/gemini-2.0-flash",
    api_key=_gemini_api_key,
    temperature=0.3,
)

In [3]:
import os
from langchain.document_loaders import (
    PyMuPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredPowerPointLoader,
)

def load_documents(directory_path):
    """Load every supported document (PDF, TXT, DOCX, PPTX) found directly in a directory.

    Entries are visited in sorted name order so repeated runs produce the same document
    order. Sub-directories and files with any other extension are skipped. A file that
    cannot be loaded (including a loader that fails while being constructed) is reported
    and skipped; it never aborts the remaining files.

    Every loaded document gets two metadata entries: ``title`` (file name without its
    extension) and ``source_file`` (the file name).

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A ``(documents, titles)`` tuple: the flat list of loaded documents and the list
        of titles of the files that loaded successfully.
    """
    # Factories are lambdas so a loader class is only looked up when a matching file is
    # actually encountered.
    loader_factories = {
        ".pdf": lambda p: PyMuPDFLoader(p),
        ".txt": lambda p: TextLoader(p, encoding="utf-8"),
        ".docx": lambda p: UnstructuredWordDocumentLoader(p),
        ".pptx": lambda p: UnstructuredPowerPointLoader(p),
    }

    documents = []
    titles = []

    for filename in sorted(os.listdir(directory_path)):
        path = os.path.join(directory_path, filename)
        title, ext = os.path.splitext(filename)
        make_loader = loader_factories.get(ext.lower())

        # A directory whose name happens to end in ".pdf" etc. is not a document.
        if make_loader is None or not os.path.isfile(path):
            print(f"Skipping unsupported file: {filename}")
            continue

        try:
            # Loader construction is inside the try on purpose: some loaders validate
            # the path or import their backend in __init__.
            docs = make_loader(path).load()
            for doc in docs:
                doc.metadata["title"] = title
                doc.metadata["source_file"] = filename
        except Exception as e:
            print(f"Failed to load {filename}: {e}")
            continue

        titles.append(title)
        documents.extend(docs)
        print(f"Loaded document: {filename}")

    print(f"\n✅ Total documents loaded: {len(documents)}")
    return documents, titles


In [4]:
from langchain.schema import Document
def split_by_chunk_size(documents, chunk_size=1000):
    """Re-chunk loaded documents into fixed-size character windows, one file at a time.

    All pages belonging to the same file are joined with a single space and the joined
    text is cut into consecutive, non-overlapping slices of ``chunk_size`` characters
    (the last slice may be shorter).

    Pages are grouped by the pair (title, source_file), so two files that share a stem
    (for example ``a.pdf`` and ``a.txt``) are chunked separately instead of being merged
    into one document.

    Args:
        documents: Objects exposing ``page_content`` (str) and ``metadata`` (dict).
        chunk_size: Maximum number of characters per chunk; must be a positive integer.

    Returns:
        A list of ``Document`` chunks whose metadata holds ``title``, ``source_file``,
        ``chunk_index`` (0-based within the file) and ``total_chunks``.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if not isinstance(chunk_size, int) or chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # dicts keep insertion order, so files are processed in first-seen order.
    pages_by_file = {}
    for doc in documents:
        title = doc.metadata.get("title") or doc.metadata.get("source_file") or "Unknown_Document"
        source_file = doc.metadata.get("source_file", title)
        pages_by_file.setdefault((title, source_file), []).append(doc.page_content)

    all_chunks = []
    for (title, source_file), pages in pages_by_file.items():
        full_text = " ".join(pages)
        starts = range(0, len(full_text), chunk_size)
        num_chunks = len(starts)

        for chunk_index, start in enumerate(starts):
            all_chunks.append(
                Document(
                    page_content=full_text[start:start + chunk_size],
                    metadata={
                        "title": title,
                        "source_file": source_file,
                        "chunk_index": chunk_index,
                        "total_chunks": num_chunks,
                    },
                )
            )

        print(f"✅ Split document '{title}' into {num_chunks} chunks")

    print(f"\n📄 Total chunks created: {len(all_chunks)}")
    return all_chunks


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

def create_vector_store(chunks, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        chunks: Non-empty list of document chunks to index.
        model_name: HuggingFace sentence-embedding model used to embed the chunks.
            An index must later be loaded with the same model it was built with.

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty. Failing here gives a clear message instead
            of an opaque error from inside the embedding/index code.
    """
    if not chunks:
        raise ValueError(
            "Cannot create a vector store from an empty list of chunks; "
            "check that the source folder contains loadable documents."
        )

    print("Creating vector store...")
    # Using HuggingFace embeddings (open-source alternative)
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_documents(chunks, embeddings)
    print("Vector store created successfully")
    return vector_store

In [6]:
# In-memory registry: course name -> path of the FAISS index saved for that course.
# It is filled by get_vector_store and starts empty whenever this cell is (re)run.
vector_database={}


In [7]:
course_name="classical machine learning"
# Kept for backward compatibility; get_vector_store derives the same path from the
# course name it is given instead of reading this global.
saved_name=course_name+"vector"
def get_vector_store(folder_path,found,course_name):
    """Return the FAISS vector store for a course, building it or loading it from disk.

    The on-disk index path is derived from the ``course_name`` argument
    (``course_name + "vector"``), so each course gets its own index.

    Args:
        folder_path: Directory holding the course documents; only read when the index
            has to be built.
        found: Truthy when an index for this course has already been saved and should
            be loaded; falsy to (re)build it from ``folder_path``.
        course_name: Course whose index is wanted.

    Returns:
        The FAISS vector store for ``course_name``.
    """
    index_path = course_name + "vector"
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

    if not found:
        documents, _titles = load_documents(folder_path)
        chunks = split_by_chunk_size(documents)
        vector_store = create_vector_store(chunks)
        vector_store.save_local(index_path)
        # Remember where the index lives so later lookups can set found=True.
        vector_database[course_name] = index_path
    else:
        # SECURITY NOTE: allow_dangerous_deserialization=True lets FAISS unpickle the
        # saved docstore. That is only acceptable because the index is written by this
        # notebook itself; never point this at an index obtained from an untrusted source.
        vector_store = FAISS.load_local(
            index_path,
            HuggingFaceEmbeddings(model_name=embedding_model),
            allow_dangerous_deserialization=True,
        )
    return vector_store


In [8]:
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from pptx.enum.text import PP_PARAGRAPH_ALIGNMENT
from crewai.tools import tool
import re

@tool
def generate_pptx_from_json(json_data: dict) -> str:
    """
    Generate PowerPoint from structured JSON content, including:
    - Agenda slides with overflow handling
    - Content slides with bullet splitting
    - Code block formatting
    - Spacing, coloring, layout consistency

    Args:
        json_data: Dict with "course_title" and "topics"; each topic has "topic_title"
            and "sessions"; each session has "session_title", "content" (list of
            bullet strings or one text block) and optionally "examples" (list) and
            "code" (string).

    Returns:
        A confirmation message; the deck is written to 'course_presentation.pptx'.
    """

    # Design & Layout
    primary_color = RGBColor(139, 0, 0)        # Dark red for headers & agenda
    dark_text = RGBColor(30, 30, 30)           # General text
    code_bg = RGBColor(240, 240, 240)          # Code block background
    white_bg = RGBColor(255, 255, 255)         # Background

    font_name = "Calibri"
    code_font_name = "Consolas"
    title_font_size = Pt(36)
    header_font_size = Pt(28)
    content_font_size = Pt(20)
    code_font_size = Pt(16)
    max_bullets_per_slide = 6

    prs = Presentation()

    def paragraph_source(text_frame):
        """Clear a text frame and return a callable handing out its paragraphs in order.

        TextFrame.clear() always leaves one empty paragraph behind. The first call
        reuses that paragraph; later calls append new ones. Appending for every item
        would leave a blank line at the top of each slide body and code box.
        """
        text_frame.clear()
        unused_first = [text_frame.paragraphs[0]]

        def next_paragraph():
            if unused_first:
                return unused_first.pop()
            return text_frame.add_paragraph()

        return next_paragraph

    def set_title(shape, text, size):
        """Write a slide title in the deck's bold, dark-red heading style."""
        shape.text = text
        heading_font = shape.text_frame.paragraphs[0].font
        heading_font.size = size
        heading_font.bold = True
        heading_font.color.rgb = primary_color

    def part_suffix(part):
        """Return ' (Part N)' for continuation slides and '' for the first one."""
        return f" (Part {part})" if part > 1 else ""

    # Title Slide
    title_slide = prs.slides.add_slide(prs.slide_layouts[0])
    set_title(
        title_slide.shapes.title,
        str(json_data.get("course_title") or "Course Title"),
        title_font_size,
    )

    def split_into_bullets(text_or_list):
        """Handles both strings and lists; always returns a list of non-blank strings."""
        if isinstance(text_or_list, list):
            # Items produced by an LLM are not guaranteed to be strings.
            return [str(item) for item in text_or_list if str(item).strip()]
        lines = [p.strip() for p in str(text_or_list).split('\n') if p.strip()]
        bullets = []
        for line in lines:
            if len(line.split()) > 15 or any(x in line for x in [';', '•', '- ', '1.', '2.']):
                split_line = re.split(r'(?<=[.;]) |(?<=\d\.) |(?<=•) |(?<=- )', line)
                bullets.extend([s.strip() for s in split_line if s.strip()])
            else:
                bullets.append(line)
        return bullets

    def add_agenda_slide(agenda_items, part=1):
        """Adds one agenda slide with up to max_bullets_per_slide bullet items."""
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        set_title(slide.shapes.title, "Agenda" + part_suffix(part), header_font_size)

        next_paragraph = paragraph_source(slide.placeholders[1].text_frame)
        for entry in agenda_items:
            is_topic = entry['level'] == 0
            p = next_paragraph()
            p.text = entry['text']
            p.level = entry['level']
            p.font.name = font_name
            p.font.size = header_font_size if is_topic else content_font_size
            p.font.color.rgb = primary_color if is_topic else dark_text
            p.font.bold = is_topic
            p.space_after = Pt(4)

    # Generate full agenda list: topics at level 0, their sessions at level 1
    full_agenda = []
    for topic in json_data.get("topics") or []:
        full_agenda.append({'text': str(topic.get("topic_title", "Untitled Topic")), 'level': 0})
        for session in topic.get("sessions") or []:
            full_agenda.append({'text': str(session.get("session_title", "Untitled Session")), 'level': 1})

    # Create paginated agenda slides
    agenda_starts = range(0, len(full_agenda), max_bullets_per_slide)
    for part, start in enumerate(agenda_starts, start=1):
        add_agenda_slide(full_agenda[start:start + max_bullets_per_slide], part)

    def add_content_slide(session_title, bullets, examples, code, part=1):
        """Adds content slide with header and up to max_bullets_per_slide bullets.

        Examples (italic) fill whatever bullet slots remain; the code block is only
        drawn on the first part. Returns the bullets that did not fit on this slide.
        """
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        slide.background.fill.solid()
        slide.background.fill.fore_color.rgb = white_bg

        # Title
        set_title(slide.shapes.title, session_title + part_suffix(part), header_font_size)

        # Bullet content
        next_paragraph = paragraph_source(slide.placeholders[1].text_frame)

        def add_line(text, italic=False):
            p = next_paragraph()
            p.text = text
            p.level = 0
            p.font.name = font_name
            p.font.size = content_font_size
            if italic:
                p.font.italic = True
            p.font.color.rgb = dark_text
            p.space_after = Pt(6)

        count = 0
        for point in bullets[:max_bullets_per_slide]:
            add_line(point)
            count += 1

        # Examples
        for ex in examples:
            for ex_point in split_into_bullets(ex):
                if count >= max_bullets_per_slide:
                    break
                add_line(ex_point, italic=True)
                count += 1

        # Code block
        code_text = str(code).strip() if code else ""
        if code_text and part == 1:
            txBox = slide.shapes.add_textbox(Inches(1), Inches(5.3), Inches(8), Inches(1.2))
            p_code = paragraph_source(txBox.text_frame)()
            p_code.text = code_text
            p_code.font.name = code_font_name
            p_code.font.size = code_font_size
            p_code.font.color.rgb = dark_text
            txBox.fill.solid()
            txBox.fill.fore_color.rgb = code_bg

        return bullets[max_bullets_per_slide:]

    # Content Slides
    for topic in json_data.get("topics") or []:
        for session in topic.get("sessions") or []:
            bullets = split_into_bullets(session.get("content", []))
            raw_examples = session.get("examples", [])
            examples = raw_examples if isinstance(raw_examples, list) else []
            code = session.get("code", "")

            # Nothing at all to show for this session.
            if not (bullets or examples or code):
                continue

            session_title = str(session.get("session_title", "Untitled Session"))
            part = 1
            # Always emit the first slide, so a session that only has examples or code
            # (no content bullets) still gets one.
            while True:
                bullets = add_content_slide(
                    session_title,
                    bullets,
                    examples if part == 1 else [],
                    code if part == 1 else "",
                    part,
                )
                part += 1
                if not bullets:
                    break

    prs.save("course_presentation.pptx")
    return "✅ PowerPoint saved as 'course_presentation.pptx' with multi-page agenda and structured slides."


In [79]:
from crewai import Agent, LLM
from pptx import Presentation
from pptx.util import Inches, Pt
class Project_agents:
    """Factory for the CrewAI agents used by the course-material pipeline.

    Every factory method returns a new ``Agent`` bound to ``self.llm``.
    """

    def __init__(self):
        """Bind the factory to the module-level ``llm`` created earlier in the notebook."""
        self.llm = llm

    def context_retriever_agent(self):
        """Agent that returns the text chunks relevant to a topic."""
        return Agent(
            role="Document Search Assistant",
            goal="Find and return relevant text chunks for each topic from a document store.",
            backstory=(
                "You specialize in retrieving relevant information for educational purposes. "
                "When given a topic, you search a document database and return the most relevant text chunks "
                "to support curriculum generation and slide creation."
            ),
            verbose=True,
            allow_delegation=False,
            llm=self.llm
        )

    def summary_agent(self):
        """Agent that turns raw content into a structured HTML summary."""
        return Agent(
            role="HTML Summary Generator",
            goal="Generate a clear and well-structured HTML summary from the input content.",
            backstory=(
                "You are an expert assistant specialized in transforming complex or lengthy educational content "
                "into organized, visually structured HTML summaries. You make the information easy to understand "
                "by dividing it into sections with headers, paragraphs, and bullet points."
            ),
            verbose=True,
            allow_delegation=False,
            llm=self.llm,
            # NOTE(review): `instructions` does not appear among the Agent attributes in
            # the CrewAI agent documentation - confirm it is actually applied. If it is
            # silently ignored, move this text into `backstory` or the task description.
            instructions="""
You will receive a block of raw content. Your task is to:
- Carefully analyze and summarize the content
- Format the output as a valid HTML page

📄 Output Formatting Instructions:
- Use <h1> for the main title (if identifiable)
- Use <h2> for section headings like "Overview", "Key Points", "Examples", "Conclusion"
- Use <p> for paragraph content
- Use <ul><li> for lists or bullet points
- Ensure spacing and nesting are correct for proper rendering
- Do not include any explanatory text, only return clean HTML content

🧠 Content Understanding:
- Identify and highlight the main ideas
- Break down the explanation into short readable sections
- Include real-world examples or applications if mentioned


🎯 Return only a complete and well-structured HTML document (no Markdown, no comments).
"""
        )

    def json_agent(self, context):
        """Agent that restructures course content into the topics/sessions JSON schema.

        Args:
            context: Passed through to the agent as ``memory=[{"context": context}]``.
        """
        return Agent(
            role="Course Structuring and Summarization Agent",
            goal=(
                "Analyze the course content, summarize it accurately without missing key information, "
                "organize it into well-defined topics and subtopics, and output it in a structured JSON format "
                "ready for slide generation."
            ),
            backstory=(
                "You are an expert assistant trained to analyze and restructure educational content. "
                "Given course material, your job is to understand and summarize the content into key topics and subtopics. "
                "Each subtopic should include important points in a bullet-point format. "
                "The final structured output should be ready for use in slide creation."
            ),
            verbose=True,
            allow_delegation=False,
            llm=self.llm,
            # NOTE(review): confirm `instructions` is an attribute Agent actually honors
            # (see the CrewAI agent documentation); otherwise this schema example never
            # reaches the model through the agent.
            instructions=(
                "You will be given course content. Based on this content:\n\n"
                "1. Understand the entire material deeply and identify all **key topics** or **chapters**.\n"
                "2. Under each topic, create **subtopics or sessions**, and summarize them clearly.\n"
                "3. For each session, extract the main concepts into **bullet points** to ensure clarity and coverage.\n\n"
                "💡 For example:\n\n"
                "{\n"
                "  \"course_title\": \"Introduction to Ensemble Learning\",\n"
                "  \"topics\": [\n"
                "    {\n"
                "      \"topic_title\": \"Ensemble Learning\",\n"
                "      \"sessions\": [\n"
                "        {\n"
                "          \"session_title\": \"Random Forest\",\n"
                "          \"content\": [\n"
                "            \"Random Forest is an ensemble of decision trees\",\n"
                "            \"It reduces variance and avoids overfitting\",\n"
                "            \"Uses bagging and feature randomness\"\n"
                "          ]\n"
                "        },\n"
                "        {\n"
                "          \"session_title\": \"Boosting\",\n"
                "          \"content\": [\n"
                "            \"Boosting focuses on errors made by previous models\",\n"
                "            \"Popular boosting algorithms include AdaBoost, Gradient Boosting, and XGBoost\",\n"
                "            \"It reduces bias and improves prediction accuracy\"\n"
                "          ]\n"
                "        }\n"
                "      ]\n"
                "    }\n"
                "  ]\n"
                "}\n\n"
                "✔️ Ensure:\n"
                "- The structure is clean and fully JSON-compatible.\n"
                "- No important concepts are missed.\n"
                "- Each session has bullet-point content for easier transformation into slides.\n"
                "- Keep explanations concise, clear, and factual.\n"
            ),
            # NOTE(review): Project_crew.create_crew passes an Agent object as `context`;
            # confirm that `memory` accepts this shape and that it is what was intended.
            memory=[{"context": context}]
        )

    def pptx_generation_agent(self):
        """Agent that converts the structured JSON into slides via generate_pptx_from_json."""
        return Agent(
            role="Bullet Point PowerPoint Specialist",
            goal="Transform a structured JSON course file into a polished PowerPoint presentation with a full agenda and detailed, bulleted slides.",
            backstory=(
                "You are a presentation specialist converting structured JSON into PowerPoint slides. "
                "You include an agenda, break content into bullet points, and add brief introductory details to enrich each session."
            ),
            verbose=True,
            allow_delegation=False,
            llm=self.llm,
            # NOTE(review): confirm `instructions` is an attribute Agent actually honors
            # (see the CrewAI agent documentation).
            instructions=(
                "INPUT:\n"
                "- A structured JSON file:\n"
                "{\n"
                "  \"course_title\": \"...\",\n"
                "  \"topics\": [\n"
                "    {\n"
                "      \"topic_title\": \"...\",\n"
                "      \"sessions\": [\n"
                "        {\n"
                "          \"session_title\": \"...\",\n"
                "          \"content\": [\n"
                "            \"bullet point 1\",\n"
                "            \"bullet point 2\",\n"
                "            ...\n"
                "          ]\n"
                "        }\n"
                "      ]\n"
                "    }\n"
                "  ]\n"
                "}\n\n"

                "AGENDA RULES:\n"
                "1. Create one or more slides titled 'Agenda' immediately after the title slide.\n"
                "2. Display ALL topic titles and session titles using nested bullets:\n"
                "   - Topic Title → Level 0, dark red, bold, 28pt\n"
                "   - Session Title → Level 1, gray, normal, 20pt\n"
                "3. If >6 items total, split agenda across multiple slides labeled 'Agenda (Part 2)', etc.\n"

                "SESSION SLIDES:\n"
                "For each session:\n"
                "1. Create a slide titled with the `session_title`\n"
                "2. Just below the title, add a **brief detailed summary** or explanatory paragraph (1–2 lines max), generated from the title and bullet themes.\n"
                "   - Format: italic, Calibri, 18pt, gray, max width of 2 lines\n"
                "3. Below the detail paragraph, add the bullet points from `content`\n"
                "   - Each bullet: ≤ 2 lines\n"
                "   - If >6 bullets, split into multiple slides (Session Title (Part 2), etc.)\n"
                "   - Indent sub-points where needed\n"
                "   - Italicize examples\n"
                "   - Bold technical terms\n"
                "   - Use larger font (22pt) for key concepts\n"

                "CODE RULES:\n"
                "If any bullet contains code (e.g. starts with `def`, `class`, `import`, or uses symbols like `()`, `{}`):\n"
                "- Extract it into a dedicated code block (gray box, monospaced font, preserved indentation)\n"
                "- Place the code block below the bullets (or move to next slide if space is limited)\n"
                "- Add line numbers if >5 lines\n"
                "- Keep code block ≤ 1/3 of slide height\n"

                "QUALITY CONTROL:\n"
                "- No bullets should wrap beyond 2 lines\n"
                "- No slide should have more than 6 main bullets\n"
                "- Code blocks and bullets should never overlap\n"
                "- Ensure detailed paragraph appears before bullets\n"

                "FINAL OUTPUT:\n"
                "- Export the file as 'course_presentation.pptx'\n"
                "- Agenda must be complete and fully paginated if needed\n"
                "- All session content must be present, formatted, and include a detail paragraph\n"
                "- Slides must be presentation-ready, clean, and consistent"
            ),
            tools=[generate_pptx_from_json]
        )

    def narrative_agent(self):
        """Agent that writes the educational narrative for the course content."""
        return Agent(
            role="Narrative Creator",
            goal="Generate educational narrative text based on course context and input content.",
            backstory="You are specialized in creating clear, engaging educational narratives.",
            # Use the instance's LLM like every other agent, not the module global.
            llm=self.llm,
            verbose=True
        )

    def customize_agent(self):
        """Agent that rewrites the narrative to a requested style, language and length."""
        return Agent(
            role="Narrative Customizer",
            goal="Revise the narrative text to match specified style, language, and length.",
            backstory="You are an expert in adapting educational content to different tones and styles.",
            # Use the instance's LLM like every other agent, not the module global.
            llm=self.llm,
            verbose=True
        )





In [80]:
from langchain.prompts import PromptTemplate

# Prompt for the narrative task. ProjectTasks.narrative_task fills every placeholder
# listed in input_variables through narrative_prompt.format(...) and uses the result as
# the task description.
narrative_prompt = PromptTemplate(
    input_variables=[
        "content",
        "presentation",
        "course_title",
        "course_desc",
        "audience_age",
        "teaching_style",
        "language"
    ],
    template="""
You are a highly skilled educational content narrator.

Your goal is to generate an engaging, easy-to-follow narrative that reflects the structure of a PowerPoint presentation and expands on the key points.

---

Course Title: {course_title}
Course Description: {course_desc}
Audience Age Group: {audience_age}
Teaching Style: {teaching_style}
Language: {language}

---

Presentation Slide Outline:
{presentation}

Raw Content for This Section:
{content}

---

🎯 Instructions:
Write a **structured educational narrative** that includes estimated **time durations** for each part. Format it so it can be used as a **video script**.

For each section, include:
1. **[Duration: ~X min]** at the beginning of the paragraph.
2. A clear and age-appropriate **introduction** to the topic.
3. A **step-by-step walkthrough** of each key point or slide element, reflecting the `presentation`.
4. A **real-world example or analogy** to make the topic relatable.
5. A **brief conclusion** connecting the lesson back to the overall course goal.

💡 Tone & Style:
- Match the teaching style: {teaching_style}
- Language: {language}
- Use age-appropriate vocabulary and structure based on the audience age: {audience_age}
- Ensure logical flow aligned with the slide sequence
- Avoid excessive technical jargon unless intended for advanced audiences

🕒 Example Output Structure:
[Duration: ~1 min] Introduction
[Duration: ~2 min] Slide 1 Explanation
[Duration: ~2 min] Slide 2 Explanation
[Duration: ~1 min] Real-World Analogy
[Duration: ~1 min] Conclusion

---

Return only the narrative in plain text, clearly structured with headings and time estimates.
"""
)

# Prompt for the customize task. ProjectTasks.customize_task fills narrative, style,
# language and length through customize_prompt.format(...); the template asks for a
# complete, grayscale-styled HTML document.
customize_prompt = PromptTemplate(
    input_variables=["narrative", "style", "language", "length"],
    template="""
You are an expert educational narrative editor and HTML formatter.

Here is the original narrative:
{narrative}

Revise it according to the following:
- Writing Style: {style}
- Language: {language}
- Desired Length: {length} (e.g., short summary, in-depth walkthrough)

🎯 Output Instructions:
- Format the final output in **clean, well-structured HTML**
- Use a **black-and-white color scheme** (purely grayscale)
- Use appropriate HTML tags:
  - <h1> for the main title
  - <h2> for section headers (e.g., "Introduction", "Key Concepts", "Real-World Example", "Conclusion")
  - Add a small estimated duration under each section header (e.g., <small>~2 minutes</small>)
  - <p> for paragraphs
  - <ul><li> for bullet points (if needed)
- Wrap the output in complete HTML including <html>, <head>, <body>
- Apply this CSS:
  - Body: white background, black text
  - Font: Helvetica, Arial, sans-serif
  - No borders, no background colors
  - Clean layout suitable for printing and web reading

🚫 Do NOT include:
- Any markdown
- Instructional text
- Extra commentary outside of HTML

📄 This content will be saved into a `narrative.html` file and viewed in a web browser.

Return only the complete HTML file, properly formatted and styled.
"""
)




In [81]:
from crewai import Task
from tqdm import tqdm
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

class ProjectTasks:
    """Factory for the CrewAI tasks of the course-material pipeline.

    Tasks are built before the crew runs, so an upstream task's ``output`` is still
    ``None`` at that point. Dependencies between tasks are therefore expressed through
    the Task ``context`` list, which hands the upstream outputs over at execution time.
    """

    @staticmethod
    def _as_task_list(context):
        """Normalize ``context`` (a Task, a list/tuple, or anything else) to a list of Tasks."""
        if isinstance(context, Task):
            return [context]
        if isinstance(context, (list, tuple)):
            return [item for item in context if isinstance(item, Task)]
        return []

    @staticmethod
    def _output_text(task, label):
        """Return the task's output as text, or a pointer to the task context if it has not run yet."""
        output = getattr(task, "output", None)
        if output is not None:
            return str(output)
        return f"[The {label} is provided in this task's context by the preceding step.]"

    def retrieve_combined_context_task(self, agent, course_name, course_description, found, folder_path, topic, k=30):
        """
        Retrieves context from the vector store relevant to a specific topic/lesson/chapter.

        Args:
            agent: The LLM agent to handle the task.
            course_name: The overall name of the course.
            course_description: A general description of the course.
            found: Whether a saved vector store for this course should be loaded
                (truthy) or built from ``folder_path`` (falsy).
            folder_path: Directory holding the course documents.
            topic: Specific topic, lesson, or chapter to retrieve context for.
            k: Number of most similar chunks to retrieve.

        Returns:
            A Task whose description embeds the retrieved chunks for ``topic``.
        """
        # Build or load the vector store
        vector_store = get_vector_store(folder_path, found, course_name)

        # Search based on specific topic/chapter/lesson
        relevant_docs = vector_store.similarity_search(topic, k=k)

        # Combine document content
        combined_context = "\n\n".join([doc.page_content for doc in relevant_docs])

        # Return a task with detailed instructions and context
        return Task(
            description=(
                f"You are given content from the course '{course_name}'.\n\n"
                f"Course Description: {course_description}\n\n"
                f"Your task is to focus specifically on the topic: '{topic}' and extract or summarize relevant information.\n\n"
                f"Context:\n{combined_context}"
            ),
            agent=agent,
            expected_output=f"A summary, explanation, or response focused on: '{topic}'."
        )

    def summary_task(self, agent, context):
        """Build the HTML-summary task.

        Args:
            agent: Agent that writes the summary.
            context: Upstream Task (or list of Tasks) whose output should be summarized;
                anything else is ignored.

        Returns:
            The summary Task.
        """
        task_kwargs = dict(
            agent=agent,
            description="Summarize the input content and return a well-formatted HTML page.",
            expected_output="Valid HTML with structured sections (<h1>, <h2>, <p>, <ul><li>) saved as summary.html",
        )
        upstream = self._as_task_list(context)
        if upstream:
            task_kwargs["context"] = upstream
        return Task(**task_kwargs)

    def json_task(self, agent, age, experience_level, context):
        """Build the task that converts course content into the slide-ready JSON schema.

        Args:
            agent: Agent that produces the JSON.
            age: Target audience age group (interpolated into the prompt).
            experience_level: Target audience experience level (interpolated into the prompt).
            context: Upstream Task (or list of Tasks) providing the course content;
                anything else is ignored.

        Returns:
            The JSON-structuring Task.
        """
        task_kwargs = dict(
            description=(
                f"You are tasked with converting the provided course context into a structured JSON format, ready for presentation slides.\n\n"
                f"🎯 Target Audience:\n"
                f"- Age Group: {age}\n"
                f"- Experience Level: {experience_level}\n\n"
                f"📋 Instructions:\n"
                f"1. Analyze and understand the provided course material thoroughly.\n"
                f"2. Identify major **topics or chapters** based on the flow of the content.\n"
                f"3. Break down each topic into **sessions** (or subtopics).\n"
                f"4. Summarize the content of each session using **bullet points**.\n"
                f"5. Adapt the tone and depth of content to match the **target audience**:\n"
                f"   - Use engaging, simple, and example-rich language for younger or beginner users.\n"
                f"   - Use technical and detailed language for experienced or advanced learners.\n\n"
                f"6. Return the output in the following **strict JSON format**:\n\n"
                "{\n"
                "  \"course_title\": \"...\",\n"
                "  \"topics\": [\n"
                "    {\n"
                "      \"topic_title\": \"...\",\n"
                "      \"sessions\": [\n"
                "        {\n"
                "          \"session_title\": \"...\",\n"
                "          \"content\": [\n"
                "            \"Bullet point 1\",\n"
                "            \"Bullet point 2\",\n"
                "            \"...\"\n"
                "          ]\n"
                "        }\n"
                "      ]\n"
                "    }\n"
                "  ]\n"
                "}\n\n"
                f"⚠️ Ensure the JSON is clean, coherent, and free of missing or redundant information."
            ),
            agent=agent,
            expected_output=(
                "A JSON file that clearly organizes the course into topics and bullet-point sessions, "
                "tailored to the specified age and experience level. Ready to be used in slide generation."
            ),
        )
        upstream = self._as_task_list(context)
        if upstream:
            task_kwargs["context"] = upstream
        return Task(**task_kwargs)

    def pptx_task(self, agent, input_from):
        """Build the slide-generation task.

        Args:
            agent: Agent that drives the PowerPoint tool.
            input_from: Upstream Task (normally the JSON task) whose output is the deck's
                source; it is attached as this task's context.

        Returns:
            The PowerPoint-generation Task.
        """
        task_kwargs = dict(
            description=(
                "Take the JSON output from the previous step and generate a PowerPoint presentation. "
                "Include a title slide with the course name and audience info, then one slide per session with session titles and content."
            ),
            agent=agent,
            expected_output="A PowerPoint file named 'course_presentation.pptx'.",
        )
        upstream = self._as_task_list(input_from)
        if upstream:
            task_kwargs["context"] = upstream
        return Task(**task_kwargs)

    def narrative_task(self, content_task, course_name, course_description, presentation_task, age, teaching_style, language, narrative_agent, narrative_prompt):
        """Build the narrative-writing task.

        The retrieved content and the presentation outline come from ``content_task``
        and ``presentation_task``. When those tasks have not run yet, the prompt points
        at the task context instead of embedding the text "None".

        Returns:
            The narrative Task.
        """
        task_kwargs = dict(
            agent=narrative_agent,
            description=narrative_prompt.format(
                content=self._output_text(content_task, "raw course content"),
                course_title=course_name,
                course_desc=course_description,
                presentation=self._output_text(presentation_task, "presentation outline"),
                audience_age=age,
                teaching_style=teaching_style,
                language=language
            ),
            # The requested language is a parameter, so do not hardcode English here.
            expected_output=f"Narrative text explaining the topic clearly in {language}.",
        )
        upstream = self._as_task_list([content_task, presentation_task])
        if upstream:
            task_kwargs["context"] = upstream
        return Task(**task_kwargs)

    def customize_task(self, narrative_task, customize_agent, customize_prompt, teaching_style, language, length):
        """Build the task that restyles the narrative and writes it to 'narrative.html'.

        The narrative text comes from ``narrative_task``; when that task has not run
        yet, the prompt points at the task context instead of embedding the text "None".

        Returns:
            The customization Task (its output is written to 'narrative.html').
        """
        task_kwargs = dict(
            agent=customize_agent,
            description=customize_prompt.format(
                narrative=self._output_text(narrative_task, "original narrative"),
                style=teaching_style,
                language=language,
                length=length
            ),
            expected_output="Customized narrative text matching style and length and saved to 'narrative.html' ",
            output_file="narrative.html",
        )
        upstream = self._as_task_list(narrative_task)
        if upstream:
            task_kwargs["context"] = upstream
        return Task(**task_kwargs)



In [82]:
# Run configuration: everything a reader might tune for one pipeline run.
# NOTE(review): the folder name is spelled "Matrials" - confirm it matches the Drive folder.
folder_path = "/content/drive/MyDrive/Matrials"
course_name="classical machine learning"
course_description="An introduction to core machine learning algorithms like regression, classification, and clustering, focusing on their concepts and practical applications."
topic="ensemble learning"
# No trailing comma here: `age="20-22",` would create the tuple ("20-22",) and the
# prompts would render it as "('20-22',)".
age="20-22"
experience_level="advanced"
language="English"
teaching_style="Motivational storytelling"
length="Detailed explanation with example"
# True when an index for this course was already registered in this kernel session.
found = course_name in vector_database

In [83]:
print(found)

True


In [84]:
from crewai import Crew, Process
import json

class Project_crew:
    """Wires the project's agents and tasks into one sequential CrewAI crew.

    The run configuration (course_name, course_description, found, folder_path, topic,
    age, experience_level, teaching_style, language, length) and the two prompt
    templates are read from the notebook's module-level variables.
    """

    def __init__(self):
        self.agents = Project_agents()
        self.tasks = ProjectTasks()

    def create_crew(self):
        """Build every agent and task and return the assembled Crew.

        Task order: retrieve context -> structure as JSON -> generate slides ->
        write narrative -> customize narrative.
        """
        # Define agents
        content_creator = self.agents.context_retriever_agent()
        summary_creator = self.agents.summary_agent()
        json_creator = self.agents.json_agent(content_creator)
        pptx_creator = self.agents.pptx_generation_agent()
        narrative_creator = self.agents.narrative_agent()
        customize_creator = self.agents.customize_agent()

        # Define tasks
        content_retriever_task = self.tasks.retrieve_combined_context_task(
            agent=content_creator,
            course_name=course_name,
            course_description=course_description,
            found=found,
            folder_path=folder_path,
            topic=topic
        )

        # Nothing has executed yet at this point, so `content_retriever_task.output`
        # is still None. Dependencies are passed as the upstream task itself.
        retrieved_context = [content_retriever_task]

        # NOTE(review): the summary agent and task are created but are not part of the
        # crew below - confirm whether the HTML summary step is meant to run.
        summary_task = self.tasks.summary_task(
            agent=summary_creator,
            context=retrieved_context
        )

        json_task = self.tasks.json_task(
            agent=json_creator,
            age=age,
            experience_level=experience_level,
            context=retrieved_context
        )

        pptx_task = self.tasks.pptx_task(
            agent=pptx_creator,
            input_from=json_task
        )

        narrative_task = self.tasks.narrative_task(
            content_retriever_task, course_name, course_description,
            pptx_task, age, teaching_style, language, narrative_creator, narrative_prompt)

        customize_task = self.tasks.customize_task(
            narrative_task, customize_creator, customize_prompt, teaching_style, language, length)

        # Create crew
        return Crew(
            agents=[content_creator, json_creator, pptx_creator,
                    narrative_creator, customize_creator],
            tasks=[content_retriever_task, json_task, pptx_task,
                   narrative_task, customize_task],
            process=Process.sequential,
            verbose=False
        )

    def run(self):
        """Assemble the crew, run it, and return the crew's result."""
        return self.create_crew().kickoff()


In [85]:
# Build the crew and execute the whole pipeline; `result` is what kickoff() returns.
crew=Project_crew()
result=crew.run()


In [25]:
# Persist the output of the crew's last task (the customize task in
# Project_crew.create_crew) as text.
final_narrative = result.tasks_output[-1]
with open("final_narrative.txt", "w", encoding="utf-8") as f:
    f.write(str(final_narrative))

In [None]:
# Dump the full crew result to JSON; ensure_ascii=False keeps non-ASCII text readable.
result_dict = result.model_dump()

with open("crew_output.json", "w", encoding="utf-8") as f:
    json.dump(result_dict, f, indent=4, ensure_ascii=False)
