In [18]:
import json
import requests
import time
import re
from typing import Dict, List, Any
from collections import defaultdict

class EducationalQAGenerator:
    def __init__(self, ollama_url="http://localhost:11434", model_name="gemma3n:e4b"):
        self.ollama_url = ollama_url
        self.model_name = model_name
        self.generated_qa = []
        
    def format_educational_data(self, record: Dict) -> tuple:
        """Format educational data for prompt"""
        main_heading = record.get("main_heading", "")
        sub_heading = record.get("sub_heading", "")
        content = record.get("content", "")
        page_start = record.get("page_start", "")
        page_end = record.get("page_end", "")
        
        return main_heading, sub_heading, content, page_start, page_end
    
    def generate_subtopic_prompt(self, record: Dict) -> str:
        """Generate prompt for sub-topic level (5 Q&A pairs)"""
        main_heading, sub_heading, content, page_start, page_end = self.format_educational_data(record)
        
        prompt = f"""You are creating educational Q&A pairs for a learning app that helps students understand academic concepts. Based on this educational content, generate exactly 5 question-answer pairs that would help a student learn about this topic.

Each question must include clear reference to the subject matter and be educational in nature.

Educational Content:
Main Topic: {main_heading}
Sub-topic: {sub_heading}
Pages: {page_start}-{page_end}
Content: {content}

Generate 5 Q&A pairs in this format:
Q1: [clear educational question about the content]
A1: [comprehensive answer with specific information]

Q2: [another educational question about the content]
A2: [comprehensive answer with specific information]

Q3: [question about key concepts or definitions]
A3: [comprehensive answer with specific information]

Q4: [question about examples or applications]
A4: [comprehensive answer with specific information]

Q5: [question about classification or comparison]
A5: [comprehensive answer with specific information]

Make questions clear and educational. Cover different aspects: definitions, examples, classifications, processes, and practical applications. Make answers comprehensive and student-friendly."""
        
        return prompt
    
    def generate_topic_prompt(self, topic_records: List[Dict]) -> str:
        """Generate prompt for topic level (30 Q&A pairs from all sub-topics)"""
        if not topic_records:
            return ""
            
        main_heading = topic_records[0].get("main_heading", "")
        
        # Combine all content from sub-topics
        combined_content = ""
        subtopics = []
        
        for record in topic_records:
            sub_heading = record.get("sub_heading", "")
            content = record.get("content", "")
            subtopics.append(sub_heading)
            combined_content += f"\n\n{sub_heading}:\n{content}"
        
        prompt = f"""You are creating comprehensive educational Q&A pairs for a learning app. Based on this complete topic content covering multiple sub-topics, generate exactly 30 question-answer pairs that would help students master this entire topic.

Each question must be educational and cover the breadth of the topic comprehensively.

Main Topic: {main_heading}
Sub-topics covered: {', '.join(subtopics)}

Complete Content: {combined_content}

Generate 30 Q&A pairs in this format:
Q1: [comprehensive question about the topic]
A1: [detailed answer with specific information]

Q2: [another comprehensive question]
A2: [detailed answer with specific information]

Continue this pattern for all 30 pairs. Cover the full scope of the topic including:
- Key definitions and concepts
- Classifications and types
- Processes and procedures
- Examples and applications
- Comparisons and relationships
- Problem-solving scenarios
- Practical applications
- Cross-connections between sub-topics

Make questions comprehensive and answers detailed for complete topic mastery."""
        
        return prompt
    
    def query_ollama(self, prompt: str, max_retries: int = 10) -> str:
        """Query Ollama API with retry logic"""
        for attempt in range(max_retries):
            try:
                response = requests.post(
                    f"{self.ollama_url}/api/generate",
                    json={
                        "model": self.model_name,
                        "prompt": prompt,
                        "stream": False,
                        "options": {
                            "temperature": 0.7,
                            "top_p": 0.9,
                            "max_tokens": 3000  # Increased for 30 Q&A pairs
                        }
                    },
                    timeout=180  # 3 minutes timeout for longer responses
                )
                
                if response.status_code == 200:
                    result = response.json()
                    return result.get("response", "")
                else:
                    print(f"HTTP Error {response.status_code}: {response.text}")
                    
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < max_retries - 1:
                    time.sleep(60)
                
        return ""
    
    def parse_qa_response(self, response: str) -> List[Dict[str, str]]:
        """Parse Q&A pairs from response"""
        qa_pairs = []
        
        # Split by Q patterns
        questions = re.split(r'Q\d+:', response)[1:]
        
        for i, q_section in enumerate(questions, 1):
            # Split question and answer
            parts = re.split(r'A\d+:', q_section, 1)
            if len(parts) == 2:
                question = parts[0].strip()
                answer = parts[1].strip()
                
                # Clean up answer (remove next question if present)
                answer = re.split(r'Q\d+:', answer)[0].strip()
                
                if question and answer:
                    qa_pairs.append({
                        "question": question,
                        "answer": answer
                    })
        
        return qa_pairs
    
    def generate_qa_for_subtopic(self, record: Dict) -> Dict:
        """Generate 5 Q&A pairs for a single sub-topic"""
        main_heading, sub_heading, content, page_start, page_end = self.format_educational_data(record)
        
        print(f"\n📖 Processing sub-topic: {sub_heading}")
        
        if not content.strip():
            print(f"⚠️  Skipping {sub_heading} - no content found")
            return None
        
        prompt = self.generate_subtopic_prompt(record)
        response = self.query_ollama(prompt)
        
        if not response:
            print(f"❌ Failed to generate Q&A for {sub_heading}")
            return None
        
        qa_pairs = self.parse_qa_response(response)
        
        if not qa_pairs:
            print(f"❌ Failed to parse Q&A for {sub_heading}")
            return None
        
        print(f"✅ Generated {len(qa_pairs)} Q&A pairs for {sub_heading}")
        
        return {
            "main_heading": main_heading,
            "sub_heading": sub_heading,
            "qa_pairs": qa_pairs,
            "level": "subtopic",
            "page_range": f"{page_start}-{page_end}",
            "content_length": len(content)
        }
    
    def generate_qa_for_topic(self, topic_records: List[Dict]) -> Dict:
        """Generate 30 Q&A pairs for entire topic (combining all sub-topics)"""
        if not topic_records:
            return None
            
        main_heading = topic_records[0].get("main_heading", "")
        print(f"\n📚 Processing main topic: {main_heading}")
        print(f"📋 Combining {len(topic_records)} sub-topics")
        
        prompt = self.generate_topic_prompt(topic_records)
        response = self.query_ollama(prompt)
        
        if not response:
            print(f"❌ Failed to generate Q&A for topic {main_heading}")
            return None
        
        qa_pairs = self.parse_qa_response(response)
        
        if not qa_pairs:
            print(f"❌ Failed to parse Q&A for topic {main_heading}")
            return None
        
        print(f"✅ Generated {len(qa_pairs)} Q&A pairs for topic {main_heading}")
        
        subtopic_list = [record.get("sub_heading", "") for record in topic_records]
        
        return {
            "main_heading": main_heading,
            "sub_headings": subtopic_list,
            "qa_pairs": qa_pairs,
            "level": "topic",
            "subtopic_count": len(topic_records),
            "total_content_length": sum(len(record.get("content", "")) for record in topic_records)
        }
    
    def process_all_records(self, json_file_path: str, output_file_path: str):
        """Process all records - generate both sub-topic and topic level Q&A"""
        print(f"📚 Loading data from: {json_file_path}")
        
        try:
            with open(json_file_path, 'r', encoding='utf-8') as f:
                records = json.load(f)
        except Exception as e:
            print(f"❌ Error loading JSON file: {e}")
            return
        
        print(f"📊 Found {len(records)} records to process")
        
        # Group records by main_heading
        topics_grouped = defaultdict(list)
        for record in records:
            main_heading = record.get("main_heading", "Unknown")
            topics_grouped[main_heading].append(record)
        
        print(f"📋 Found {len(topics_grouped)} main topics")
        
        all_qa_data = []
        successful_subtopics = 0
        successful_topics = 0
        
        # Process each main topic
        for main_heading, topic_records in topics_grouped.items():
            print(f"\n🎯 Processing main topic: {main_heading}")
            print(f"   └── Contains {len(topic_records)} sub-topics")
            
            # 1. Generate Q&A for each sub-topic (5 each)
            for i, record in enumerate(topic_records):
                print(f"\n📍 Sub-topic progress: {i+1}/{len(topic_records)}")
                
                subtopic_qa = self.generate_qa_for_subtopic(record)
                if subtopic_qa:
                    all_qa_data.append(subtopic_qa)
                    successful_subtopics += 1
                
                time.sleep(2)  # Rate limiting
            
            # 2. Generate Q&A for entire topic (30 total)
            topic_qa = self.generate_qa_for_topic(topic_records)
            if topic_qa:
                all_qa_data.append(topic_qa)
                successful_topics += 1
            
            time.sleep(3)  # Longer pause between topics
        
        # Save results
        try:
            with open(output_file_path, 'w', encoding='utf-8') as f:
                json.dump(all_qa_data, f, indent=2, ensure_ascii=False)
            
            print(f"\n🎉 SUCCESS!")
            print(f"✅ Processed {successful_subtopics} sub-topics successfully")
            print(f"✅ Processed {successful_topics} main topics successfully")
            print(f"💾 Saved to: {output_file_path}")
            
            # Calculate total Q&A pairs
            total_qa = sum(len(item["qa_pairs"]) for item in all_qa_data)
            print(f"📊 Total Q&A pairs generated: {total_qa}")
            
        except Exception as e:
            print(f"❌ Error saving results: {e}")


## test

In [12]:

# # Usage example:
# if __name__ == "__main__":
qa_generator = EducationalQAGenerator()

    
# Or test single sub-topic:
sample_record = {
    "main_heading": "CROP PRODUCTION AND MANAGEMENT",
    "sub_heading": "1.1 Agricultural Practices",
    "content": " Till 10,000 B.C.E. people were nomadic. They were wandering in groups from place to place in search of food and shelter. They ate raw fruits and vegetables and started hunting animals for food. Later, they could cultivate land and produce rice, wheat and other food crops. Thus, was born ‘Agriculture’. When plants of the same kind are cultivated at one place on a large scale, it is called a  crop . For example, crop of wheat means that all the plants grown in a field are that of wheat. You already know that crops are of different types like cereals, vegetables and fruits. These can be classified on the basis of the season in which they grow. India is a vast country. The climatic conditions like temperature, humidity and rainfall vary from one region to another. Accordingly, there is a rich You have learnt that all living organisms require food. Plants can make their food themselves. Can you recall how green plants synthesise their own food? Animals including humans can not make their own food. So, where do animals get their food from? But, first of all why do we have to eat food? You already know that energy from the food is utilised by organisms for carrying out their various body functions, such as digestion, respiration and excretion. We get our food from plants, or animals, or both. Since we all need food, how can we provide food to a large number of people in our country? Food has to be produced on a large scale. I want to know where and how we use these tools. 2018-19 SCIENCE 2 variety of crops grown in different parts of the country. Despite this diversity, two broad cropping patterns can be identified. These are: (i) Kharif Crops :  The crops which are sown in the rainy season are called kharif crops. The rainy season in India is generally from June to September. Paddy, maize, soyabean, groundnut and cotton are kharif crops. (ii) Rabi Crops :  The crops grown in the winter season (October to March) are called rabi crops. Examples of rabi crops are wheat, gram, pea, mustard and linseed. Besides these, pulses and vegetables are grown during summer at many places.",
    "page_start": 14,
    "page_end": 15
}

test_result = qa_generator.generate_qa_for_subtopic(sample_record)


📖 Processing sub-topic: 1.1 Agricultural Practices
✅ Generated 5 Q&A pairs for 1.1 Agricultural Practices


In [None]:
test_result

In [14]:
test_result = qa_generator.generate_qa_for_topic([sample_record])


📚 Processing main topic: CROP PRODUCTION AND MANAGEMENT
📋 Combining 1 sub-topics
✅ Generated 30 Q&A pairs for topic CROP PRODUCTION AND MANAGEMENT


In [None]:
test_result

In [19]:
# Initialize the generator
qa_generator = EducationalQAGenerator()
    
# Process all depression records
input_file = "/Users/saikumarallaka/kaggle/gemma_3n_impact_challenge/datasets/education/education_structured_data_extract.json"  # Your input file
output_file = "/Users/saikumarallaka/kaggle/gemma_3n_impact_challenge/datasets/education/education_qa_dataset.json"  # Output file

qa_generator.process_all_records(input_file, output_file)

📚 Loading data from: /Users/saikumarallaka/kaggle/gemma_3n_impact_challenge/datasets/education/education_structured_data_extract.json
📊 Found 137 records to process
📋 Found 18 main topics

🎯 Processing main topic: CROP PRODUCTION   AND MANAGEMENT
   └── Contains 11 sub-topics

📍 Sub-topic progress: 1/11

📖 Processing sub-topic: 1.1 Agricultural Practices
✅ Generated 5 Q&A pairs for 1.1 Agricultural Practices

📍 Sub-topic progress: 2/11

📖 Processing sub-topic: 1.2 Basic Practices of Crop Production
✅ Generated 5 Q&A pairs for 1.2 Basic Practices of Crop Production

📍 Sub-topic progress: 3/11

📖 Processing sub-topic: 1.3 Preparation of Soil
✅ Generated 5 Q&A pairs for 1.3 Preparation of Soil

📍 Sub-topic progress: 4/11

📖 Processing sub-topic: 1.4 Sowing
✅ Generated 5 Q&A pairs for 1.4 Sowing

📍 Sub-topic progress: 5/11

📖 Processing sub-topic: 1.5 Adding Manure  and Fertilisers
✅ Generated 5 Q&A pairs for 1.5 Adding Manure  and Fertilisers

📍 Sub-topic progress: 6/11

📖 Processing sub-

In [20]:
1

1