In [None]:
!pip install groq
!pip install python-dotenv==1.0.0


Collecting groq
  Downloading groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.11.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.11.0
Collecting python-dotenv==1.0.0
  Downloading python_dotenv-1.0.0-py3-none-any.whl.metadata (21 kB)
Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0


#Genearation of QA pairs for a chapter.
The 'DataProcessed' folder contain 'merged_html' file for each chapter separately. You need to upload both .env file and 'merged_html' file begore running this code. The output will be a set of json files, each contain QA pairs for each chunk and the context (chunk itself).

You can adjust the examples for desired and undesired questions in the prompt according to the specific subject.

In [None]:
from typing import List, Optional, Dict, Union
import json
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from groq import Groq
import os
from bs4 import BeautifulSoup  # Import BeautifulSoup


load_dotenv()

client = Groq(
    # This is the default and can be omitted
    api_key=os.environ.get("GROQ_API_KEY"),
)

groq = Groq()




# Data model for LLM to generate

class Question(BaseModel):
    question: str
    answer: Union[str, Dict]

class ChapterQA(BaseModel):
    questions: List[Question]


def generate_qa_from_chapter(script_content: str) -> ChapterQA:

    """
    Generate maximum number questions-answer pairs for students based on the provided script_content.
    """

    prompt = f"""
    You are an assistant who creates questions and corresponding answers for educational use based on the provided script content.
    Question complexity should be medium level.

    **Instructions:**
    1. **Exclude questions related to the examples, figures or diagrams.** Focus on textual content.
    2. **Generate maximum number of unambiguous, well-defined question-answer pairs for students based on the provided script_content.**
    Ensure each question clarifies potentially broad terms, specifies the scope of the topic, and indicates the level of detail expected.
    Avoid vague terms like ‘other’, ‘etc.' or 'equation number' and provide guidance within the question if multiple interpretations might arise.
    For example, specify particular concepts or terms if they need further elaboration to avoid confusion.
    3. **Ensure that answers are thoroughly address all concepts directly required by the question.**
    4. **For answers that include an equation, provide a brief description of each term in the equation.**

    **Examples of Desired Questions:**

    * "Explain the concept of angular momentum in the context of rotational motion."
    * "What are the key differences between center of mass and center of gravity?"
    * "Using the equation for torque, calculate the force required to rotate an object with a given moment of inertia."

    **Examples of Undesired Questions:**

    * "What are some things related to rotational motion?" (Too broad)
    * "Discuss the importance of this equation." (Unclear what "this" refers to)
    * "What else can we learn from this chapter?" (Vague and open-ended)

    Script Content: {script_content}

    Output the questions and answers in the following JSON format:
    json {{ "questions": [ {{ "question": "Question here", "answer": "Answer to the question" }},  # ... more questions
    ]}}
    """
    chat_completion = groq.chat.completions.create(
        messages=[
            {"role": "system", "content": prompt},
        ],
        model="llama3-70b-8192",  # You might need to adjust the model
        temperature=0,
        response_format={"type": "json_object"},
    )
    return ChapterQA.model_validate_json(chat_completion.choices[0].message.content)


def save_qa_to_json(chapter_qa: ChapterQA, filename: str, context: str):
    """Saves the chapter QA data to a JSON file."""
    chapter_qa_json = chapter_qa.model_dump()
    chapter_qa_json["context"] = context
    with open(filename, "w") as f:
        json.dump(chapter_qa_json, f, indent=4)


# Example usage:
with open("/content/merged_output.html", "r") as f:  # Replace with your HTML file
    html_content = f.read()

soup = BeautifulSoup(html_content, "html.parser")
script_content = ""
for script_tag in soup.select_one("script"):
    script_content += script_tag.string if script_tag.string else ""

chunk_size = 15000
overlap = 3000
chunks = []
for i in range(0, len(script_content), chunk_size - overlap):
  chunk = script_content[i:i + chunk_size]
  chunks.append(chunk)

max_index = len(chunks) * chunk_size

for i in range(len(chunks)):
  start_index = i * (chunk_size - overlap)
  end_index = start_index + chunk_size
  if end_index > max_index:
    end_index = max_index
  chunk = script_content[start_index:end_index]
  print(i)
  chapter_qa = generate_qa_from_chapter(chunk)
  save_qa_to_json(chapter_qa, 'chapter_qa'+str(i)+'.json', chunk)
  print('Chapter QA saved to chapter_qa'+str(i)+'.json')
  print(max_index-start_index)




0
Chapter QA saved to chapter_qa0.json
45000
1
Chapter QA saved to chapter_qa1.json
33000
2
Chapter QA saved to chapter_qa2.json
21000


In [None]:
len(chunks)

15

#Run Only When Errors Occur
This code can be used if certain chunks cause failures during JSON file generation, allowing you to skip the problematic chunk.

For instance, if the chunk named 'chapter_qa2' was saved last before the error occurred, it indicates that the 3rd chunk encountered issues. In that case, you can adjust the start index of the for loop to '4' and run the code. If similar errors occur again, you can repeat this process. If the above code runs without any errors, this code can be skipped.

In [None]:
for i in range(5, len(chunks) + 1):
  start_index = i * (chunk_size - overlap)
  end_index = start_index + chunk_size
  if end_index > max_index:
    end_index = max_index
  chunk = script_content[start_index:end_index]
  #print(i)
  chapter_qa = generate_qa_from_chapter(chunk)
  save_qa_to_json(chapter_qa, 'chapter_qa'+str(i)+'.json', chunk)
  print('Chapter QA saved to chapter_qa'+str(i)+'.json')
  print(max_index-start_index)


#To Merge All JSON Files into a Single JSON for the Entire Chapter

Adjust the following settings before running:



*   Chapter_number:""
*   Chapter_name: ""
*   output file_name (set according to your convinience)




In [None]:
import json
import os

def merge_json_files(file_paths, output_file):
    merged_data = []
    count = 0
    for path in file_paths:
        with open(path, 'r') as file:
            data = json.load(file)
            count = count + (len(data['questions']))
            merged_data.append(data)
    print(f"The number of question-answer pairs in this chapter = {count}")
    merged_json= {
        "chapter_number": "02",
        "chapter_name": "MATHEMATICALMODELLING",
        "question_answers": merged_data  # Store the merged data under a 'data' key
    }
    with open(output_file, 'w') as outfile:
        json.dump(merged_json, outfile, indent=4)

files = [f for f in os.listdir('/content') if os.path.isfile(os.path.join('/content', f)) and f.lower().endswith(('.json'))]

merge_json_files(files, '/content/ch-02.json')


The number of question-answer pairs in this chapter = 16


#To download the merged JSON

In [None]:
from google.colab import files

files.download('/content/ch-a2.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#To Clear All JSON and HTML Files in the /content Directory

Run this code to clean up the /content folder before starting a new QA generation process.

In [None]:
import os

# Define the directory to clean
directory = '/content'

# Iterate through all files in the directory
for filename in os.listdir(directory):
    # Check if the file is a JSON or HTML file
    if filename.endswith((".json", ".html")):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Attempt to delete the file
        try:
            os.remove(file_path)
            print(f"Deleted: {filename}")
        except OSError as e:
            print(f"Error deleting {filename}: {e}")

Deleted: chapter_qa1.json
Deleted: chapter_qa0.json
Deleted: merged_output.html
Deleted: chapter_qa2.json
Deleted: ch-a2.json


In [15]:
!cd afeefapp/Enhanced_ASAG

/bin/bash: line 1: cd: afeefapp/Enhanced_ASAG: No such file or directory
