In [None]:
!pip install -q google-generativeai langchain tiktoken

In [None]:
!pip install openai==0.28



In [None]:
# 2. Imports
import getpass
import json
import os
import re
import time

import google.generativeai as genai
import openai
import pandas as pd
from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Setup Gemini API
# Use the key already present in the environment; if it is missing, ask for it with
# hidden input. The previous version overwrote the variable with the literal
# placeholder string "GOOGLE_API_KEY", which clobbered any real key that was set.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# Shared model handle used by generate_qa() for every request.
model = genai.GenerativeModel("models/gemini-1.5-flash")


In [None]:
# Setup GPT-4 API (currently disabled; uncomment to use the GPT generation cell)

# os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"
# openai.api_key = os.environ["OPENAI_API_KEY"]



In [None]:
# Upload documents
# files.upload() opens the Colab upload dialog; its return value is kept in
# `uploaded` but is not used below.
uploaded = files.upload()

# NOTE(review): the recorded output of this cell shows re-uploaded files being saved
# as "<name> (1).txt", so the fixed paths below may point at an older copy — confirm
# which files are meant to be read.
# Context managers close the file handles as soon as the text has been read.
with open("/content/extractedEng_text.txt", encoding="utf-8") as eng_file:
    eng_text = eng_file.read()
with open("/content/extractedAra_text.txt", encoding="utf-8") as ara_file:
    ara_text = ara_file.read()



Saving extractedAra_text.txt to extractedAra_text (1).txt
Saving extractedEng_text.txt to extractedEng_text (1).txt


In [None]:
# Chunk documents
# One splitter (chunk_size=1000, chunk_overlap=75) is shared by both languages;
# the English text is split first, then the Arabic text.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=75, chunk_size=1000)
english_chunks, arabic_chunks = (
    splitter.split_text(document) for document in (eng_text, ara_text)
)

In [None]:

def prompt_en(context):
    """Build the English question-generation prompt for one chunk of text.

    Args:
        context: The source text chunk. It is formatted into the end of the prompt,
            wrapped in triple double-quote fences.

    Returns:
        The prompt string: the task instructions, the JSON example describing the
        expected response, and finally the fenced ``context``.
    """
    # Static part of the prompt. This is a plain (non-f) string, so the braces of
    # the JSON example are written literally.
    instructions = """
You are a helpful assistant generating test questions for a retrieval-based system used by pilgrims during Hajj and Umrah.

Generate 4 types of questions from the text:
1. Conditional (e.g., "If a pilgrim forgets to do X, what should they do?")
2. Reasoning (e.g., "Why is it important to do X during Y?")
3. Scenario (e.g., "I am performing Umrah and does X. What should I do next?")
4. Simple factual (e.g., "How many rounds are in Tawaf?")

Make sure:
- All questions can be answered strictly based on the text.
- Avoid questions that refer to the document itself like what is the purpose of this document or where can i find this information in the document or general advice not found in the text.
-Do not generate questions out of the documents and Do not generate questions there answers not in the documents

Return in JSON format like this:
{
  "questions": [
    {"type": "conditional", "question": "...", "answer": "..."},
    {"type": "reasoning", "question": "...", "answer": "..."},
    {"type": "scenario", "question": "...", "answer": "..."},
    {"type": "simple", "question": "...", "answer": "..."}
  ]
}

Text:
"""
    # The chunk goes last, delimited on both sides by the same fence.
    fence = '"""'
    return f"{instructions}{fence}{context}{fence}\n"

def prompt_ar(context):
    """Build the Arabic question-generation prompt for one chunk of text.

    Args:
        context: The source text chunk. It is formatted into the end of the prompt,
            wrapped in triple double-quote fences.

    Returns:
        The prompt string: the Arabic task instructions, the JSON example describing
        the expected response, and finally the fenced ``context``.
    """
    # Static part of the prompt. This is a plain (non-f) string, so the braces of
    # the JSON example are written literally.
    instructions = """
أنت مساعد ذكي يساعد في توليد أنواع مختلفة من الأسئلة من النصوص الخاصة بالحج والعمرة.

أنشئ ٤ أنواع من الأسئلة بناءً على النص:
1. شرطية (مثال: "إذا نسي الحاج أن يفعل كذا، ماذا يجب أن يفعل؟")
2. استنتاجية (مثال: "لماذا يعتبر فعل كذا مهماً أثناء كذا؟")
3. سيناريو (مثال: "أنا أؤدي العمرة وقمت بـفعل كذا . ماذا يجب أن أفعل بعد ذلك؟")
4. بسيطة مباشرة (مثال: "كم عدد أشواط الطواف؟")

الشروط:
يمكن الإجابة على جميع الأسئلة اعتمادًا فقط على النص.

تجنّب الأسئلة التي تشير إلى الوثيقة نفسها مثل: ما الهدف من هذه الوثيقة؟ أو أين يمكنني العثور على هذه المعلومات في الوثيقة؟ أو نصائح عامة غير موجودة في النص.

لا تُنشئ أسئلة من خارج الوثيقة، ولا تُنشئ أسئلة لا توجد إجاباتها داخل الوثيقة.



أعد النتيجة بصيغة JSON كالتالي:
{
  "questions": [
    {"type": "conditional", "question": "...", "answer": "..."},
    {"type": "reasoning", "question": "...", "answer": "..."},
    {"type": "scenario", "question": "...", "answer": "..."},
    {"type": "simple", "question": "...", "answer": "..."}
  ]
}

النص:
"""
    # The chunk goes last, delimited on both sides by the same fence.
    fence = '"""'
    return f"{instructions}{fence}{context}{fence}\n"

In [None]:

# Helper: clean Gemini markdown output
def extract_json(text):
    """Pull the JSON payload out of a model reply that may be wrapped in a Markdown fence.

    Lookup order:
      1. the first fence tagged ``json`` (matched case-insensitively),
      2. otherwise the first fence of any kind, skipping a language tag that sits
         alone on the opening line,
      3. otherwise the whole reply.

    Args:
        text: Raw reply text from the model.

    Returns:
        The selected text with surrounding whitespace stripped. It is not validated
        here; the caller is expected to run ``json.loads`` on it.
    """
    # A fence explicitly tagged as JSON wins, wherever it appears in the reply.
    tagged = re.search(r"```json(.*?)```", text, re.DOTALL | re.IGNORECASE)
    if tagged:
        return tagged.group(1).strip()

    # Fall back to an untagged (or differently tagged) fence, so replies such as
    # "```\n{...}\n```" no longer reach json.loads with the backticks still attached.
    fenced = re.search(r"```(?:[\w+-]*\r?\n)?(.*?)```", text, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()

    return text.strip()

# Helper: filter weak QAs
def is_valid(qa):
    """Decide whether a generated question/answer item is worth keeping.

    An item is kept only when all of the following hold:
      * ``qa`` is a dict and its "question" and "answer" values are strings,
      * the stripped question is longer than 8 characters,
      * the stripped answer is longer than 10 characters,
      * the answer does not contain "not available in the text" or
        "check the guide" (compared case-insensitively).

    Malformed items (a non-dict, or a null / non-string field) are rejected with
    ``False`` instead of raising, so one bad item does not abort the processing of
    the remaining items of the same reply.

    Args:
        qa: One element of the "questions" list parsed from the model reply.

    Returns:
        True if the item passes every check, otherwise False.
    """
    if not isinstance(qa, dict):
        return False

    question = qa.get("question", "")
    answer = qa.get("answer", "")
    if not isinstance(question, str) or not isinstance(answer, str):
        return False

    question, answer = question.strip(), answer.strip()
    if len(question) <= 8 or len(answer) <= 10:
        return False

    # Phrases that signal the model could not answer from the text.
    lowered_answer = answer.lower()
    rejected_phrases = ("not available in the text", "check the guide")
    return not any(phrase in lowered_answer for phrase in rejected_phrases)


In [None]:

# Gemini generation
results = []  # module-level accumulator; every generate_qa() call appends to it


def generate_qa(chunks, lang_code, prompt_fn, start=0, limit=20):
    """Generate QA pairs with Gemini for a window of chunks and append them to ``results``.

    For each chunk in ``chunks[start:start + limit]`` the prompt built by
    ``prompt_fn`` is sent to the module-level ``model``; the reply is cleaned with
    ``extract_json``, parsed as JSON, and every item accepted by ``is_valid`` is
    appended to the module-level ``results`` list together with its source chunk.

    Args:
        chunks: Sequence of text chunks.
        lang_code: "ar" labels the records as "arabic"; any other value as "english".
        prompt_fn: Callable taking one chunk and returning the prompt string.
        start: Index of the first chunk to process.
        limit: Maximum number of chunks to process.

    Returns:
        None. Output is delivered through ``results`` and the printed progress lines.

    Notes:
        Errors for a single chunk (request failure, invalid JSON, unexpected reply
        shape) are printed and the loop moves on to the next chunk.
    """
    batch = chunks[start:start + limit]
    total = len(batch)  # may be smaller than `limit` near the end of `chunks`
    language = "arabic" if lang_code == "ar" else "english"

    for position, chunk in enumerate(batch, start=1):
        prompt = prompt_fn(chunk)
        try:
            response = model.generate_content(prompt)
            parsed = json.loads(extract_json(response.text))

            # The prompt asks for {"questions": [...]}; also accept a bare list.
            if isinstance(parsed, dict):
                candidates = parsed.get("questions") or []
            elif isinstance(parsed, list):
                candidates = parsed
            else:
                raise ValueError(f"unexpected JSON shape: {type(parsed).__name__}")

            added = 0
            for qa in candidates:
                # Skip anything that is not a dict before running the content filter.
                if isinstance(qa, dict) and is_valid(qa):
                    results.append({
                        "question": qa["question"],
                        "answer": qa["answer"],
                        "context": chunk,
                        "language": language,
                        "type": qa.get("type", "unknown"),
                        "model": "gemini-1.5-flash"
                    })
                    added += 1
            # Report how many items were really kept, against the real batch size.
            print(f"✅ {added} valid QAs added from {lang_code} chunk {position}/{total}")
        except Exception as e:
            # Best effort: report the failure and continue with the next chunk.
            print(f"❌ Error ({lang_code} chunk {position}):", e)
        time.sleep(2)  # avoid quota spikes

In [None]:
#GPT Generation

# os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY"  # key redacted: never store real credentials in the notebook; revoke the key that was previously committed here
# openai.api_key = os.environ["OPENAI_API_KEY"]

# results = []

# def generate_qa(chunks, lang_code, prompt_fn, limit=50):
#     for i, chunk in enumerate(chunks[:limit]):
#         prompt = prompt_fn(chunk)
#         try:
#             response = openai.ChatCompletion.create(
#                 model="gpt-4",
#                 messages=[
#                     {"role": "system", "content": "You are a helpful assistant that generates question-answer pairs in JSON format."},
#                     {"role": "user", "content": prompt}
#                 ]
#             )
#             response_text = response["choices"][0]["message"]["content"]

#             cleaned = extract_json(response_text)
#             parsed = json.loads(cleaned)

#             for qa in parsed.get("questions", []):
#                 if is_valid(qa):
#                     results.append({
#                         "question": qa["question"],
#                         "answer": qa["answer"],
#                         "context": chunk,
#                         "language": "arabic" if lang_code == "ar" else "english",
#                         "type": qa.get("type", "unknown"),
#                         "model": "gpt-4"
#                     })
#             print(f"✅ Valid QAs added from {lang_code} chunk {i+1}/{limit}")

#         except Exception as e:
#             print(f"❌ Error ({lang_code} chunk {i+1}):", e)

#         time.sleep(2)


In [None]:
# Run both languages

#GPT
# generate_qa(english_chunks, "en", prompt_en, limit=20) #first 20 chunks
# generate_qa(arabic_chunks, "ar", prompt_ar, limit=20) #first 20 chunks


# Gemini: process chunks 20–39 of each language, English first and then Arabic.
batch_window = {"start": 20, "limit": 20}
generate_qa(english_chunks, "en", prompt_en, **batch_window)
generate_qa(arabic_chunks, "ar", prompt_ar, **batch_window)



✅ Valid QAs added from en chunk 1/20
✅ Valid QAs added from en chunk 2/20
✅ Valid QAs added from en chunk 3/20
✅ Valid QAs added from en chunk 4/20
✅ Valid QAs added from en chunk 5/20
✅ Valid QAs added from en chunk 6/20
✅ Valid QAs added from en chunk 7/20
✅ Valid QAs added from en chunk 8/20
✅ Valid QAs added from en chunk 9/20
✅ Valid QAs added from en chunk 10/20
✅ Valid QAs added from en chunk 11/20
✅ Valid QAs added from en chunk 12/20
✅ Valid QAs added from en chunk 13/20
✅ Valid QAs added from en chunk 14/20
✅ Valid QAs added from en chunk 15/20
✅ Valid QAs added from en chunk 16/20
✅ Valid QAs added from en chunk 17/20
✅ Valid QAs added from en chunk 18/20
✅ Valid QAs added from en chunk 19/20
✅ Valid QAs added from en chunk 20/20
✅ Valid QAs added from ar chunk 1/20
✅ Valid QAs added from ar chunk 2/20
✅ Valid QAs added from ar chunk 3/20
✅ Valid QAs added from ar chunk 4/20
✅ Valid QAs added from ar chunk 5/20
✅ Valid QAs added from ar chunk 6/20
✅ Valid QAs added from ar c

In [None]:
# Write the collected QA records to a JSON file, then download it from Colab.
output_path = "TestsetGemeni2.json"

# ensure_ascii=False keeps the Arabic text readable instead of \u escapes.
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(results, out_file, indent=2, ensure_ascii=False)

files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>