<a href="https://colab.research.google.com/github/ahteshamsalamatansari/colabcodes/blob/main/New_Question_Bank_28_00.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Reading Comprehension Question Generator for Google Colab
# Enhanced version with natural question patterns and proper formatting

import json
import pandas as pd
import hashlib
import re
import openai
import time
import os
import random
from datetime import datetime
from google.colab import files, drive
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from tqdm import tqdm
import gc
import psutil

# Configuration and Setup
class Config:
    """Mutable bag of runtime settings shared by the rest of the script.

    Attributes:
        api_key: OpenAI API key; starts as ``None`` and is filled in later
            by the setup code.
        model: Chat model name sent with every completion request.
        max_workers: Size of the thread pool used when a batch of articles
            is processed concurrently.
        batch_size: Number of articles handed to one batch.
        max_retries: Retry budget. Note: nothing in the visible code reads
            this value; ``call_openai_with_retry`` has its own default.
        retry_delay: Base delay in seconds between retries.
    """

    def __init__(self):
        # Credentials and model selection.
        self.api_key, self.model = None, "gpt-3.5-turbo"
        # Concurrency / batching knobs.
        self.max_workers, self.batch_size = 5, 100
        # Retry policy.
        self.max_retries, self.retry_delay = 3, 2

config = Config()

# Get API key and file settings
print("🚀 Enhanced Reading Comprehension Question Generator")
print("=" * 50)

# Input configurations
# Read the key with getpass instead of input() so the secret is not echoed
# into the notebook output (which is saved and often shared with the notebook).
from getpass import getpass

config.api_key = getpass("Enter your OpenAI API key: ").strip()
if not config.api_key:
    # Fail fast: without a key every later API call would fail after retries.
    raise ValueError("An OpenAI API key is required; re-run this cell and enter one.")

# Initialize OpenAI client (works with both old and new versions)
try:
    # Try new OpenAI v1.0+ client; the class does not exist in legacy releases,
    # so the import itself is the version probe.
    from openai import OpenAI
    openai_client = OpenAI(api_key=config.api_key)
    use_new_api = True
    print("✅ Using OpenAI v1.0+ API")
except ImportError:
    # Fallback to old API: the module itself acts as the client
    openai.api_key = config.api_key
    openai_client = openai
    use_new_api = False
    print("✅ Using OpenAI legacy API")

output_filename = input("Enter output filename (without extension): ").strip()
if not output_filename:
    # Empty answer -> fall back to a fixed default name
    output_filename = "reading_comprehension_output"

# Mount Google Drive (result files are written under /content/drive/MyDrive)
print("\n📁 Mounting Google Drive...")
drive.mount('/content/drive')

# Sentence openers offered to the model, grouped by question category.
# generate_questions_for_article() draws two random starters from each of the
# first three categories and one from "main_idea" for every article, then
# embeds them in the prompt. Each list therefore needs at least two entries
# (random.sample(..., 2) raises on shorter lists).
QUESTION_STARTERS = {
    # Questions answered by a fact stated directly in the passage
    "detail_comprehension": [
        "Based on the passage, what",
        "The text states that",
        "According to the information provided",
        "The passage mentions that",
        "From the article, we learn that",
        "The writer indicates that",
        "The text reveals that",
        "As described in the passage",
        "The article explains that",
        "The passage clearly shows that",
        "We can determine from the text that",
        "The information suggests that",
        "The content indicates that",
        "From what is written",
        "The passage describes",
        "The text identifies",
        "According to the details given",
        "The article specifies that",
        "The writing confirms that",
        "Based on the facts presented"
    ],
    # Questions that require drawing a conclusion from the passage
    "inference_reasoning": [
        "It can be concluded that",
        "The passage suggests that",
        "We can infer that",
        "The text implies that",
        "One can reasonably assume that",
        "The evidence points to",
        "The information leads us to believe that",
        "Based on the context",
        "The underlying message is that",
        "The passage hints that",
        "We might deduce that",
        "The text indicates that",
        "From the evidence presented",
        "The overall tone suggests that",
        "Reading between the lines",
        "The subtext reveals that",
        "The implications are that",
        "The logical conclusion is that",
        "The passage conveys that",
        "The deeper meaning suggests"
    ],
    # Questions about the author's stance or feelings
    "opinion_attitude": [
        "The author's perspective on",
        "The writer's viewpoint regarding",
        "The author feels that",
        "The writer's stance on",
        "The author's opinion about",
        "The text reflects a",
        "The author demonstrates",
        "The writer shows",
        "The author's attitude toward",
        "The writer expresses",
        "The author believes that",
        "The text conveys the author's",
        "The writer's position is that",
        "The author's approach to",
        "The writer regards",
        "The author considers",
        "The text shows the author",
        "The writer's judgment about",
        "The author's take on",
        "The writer's assessment of"
    ],
    # Questions about the overall theme (one per article)
    "main_idea": [
        "The central theme of this passage is",
        "This text primarily focuses on",
        "The main point of the passage is",
        "The passage is essentially about",
        "The primary subject of this text is",
        "The core message of the passage is",
        "This article mainly discusses",
        "The fundamental topic is",
        "The passage centers on",
        "The key focus of the text is",
        "The predominant theme is",
        "The passage's main concern is",
        "The central focus involves",
        "The primary emphasis is on",
        "The text is fundamentally about",
        "The overarching theme is",
        "The passage mainly addresses",
        "The principal subject matter is",
        "The text chiefly examines",
        "The main thrust of the passage is"
    ]
}

# Regex patterns for boilerplate openers that are stripped from the start of a
# generated "analysis" string. Every pattern is anchored with ^, so it only
# matches at the very beginning of the text. generate_questions_for_article()
# applies them one after another, in list order, with re.sub(..., flags=re.IGNORECASE).
ANALYSIS_PHRASES_TO_REMOVE = [
    # Openers that refer to the passage / text / author
    r'^This option is correct as the passage explains that\s*',
    r'^This is correct because\s*',
    r'^The passage supports this by\s*',
    r'^The text clearly indicates that\s*',
    r'^Based on the information provided,?\s*',
    r'^The author explicitly mentions that\s*',
    r'^The passage demonstrates that\s*',
    r'^The evidence shows that\s*',
    r'^The text confirms that\s*',
    r'^The article establishes that\s*',
    r'^The writing reveals that\s*',
    r'^The content supports this through\s*',
    r'^The passage provides evidence that\s*',
    r'^The information confirms that\s*',
    r'^The text validates this by\s*',
    r'^The author\'s words support this because\s*',
    r'^The passage substantiates this through\s*',
    r'^The evidence points to this since\s*',
    r'^The text backs this up by\s*',
    r'^The information corroborates this because\s*',
    r'^The passage verifies this through\s*',
    # Openers that restate which option letter is correct
    r'^The answer is [A-D] because\s*',
    r'^[A-D] is correct because\s*',
    r'^This option is correct because\s*',
    r'^This answer is right because\s*',
    r'^The correct answer is [A-D] because\s*',
    r'^Option [A-D] is correct as\s*',
    r'^This choice is right because\s*'
]

# Domain classification function
# Keywords per domain. All entries are lower-case because they are matched
# against lower-cased text. Dict order doubles as the tie-break order.
_DOMAIN_KEYWORDS = {
    "Technology": ["technology", "tech", "digital", "ai", "software", "internet", "computer", "smartphone", "app"],
    "Science": ["research", "study", "scientist", "experiment", "discovery", "medical", "health", "biology"],
    "Sports": ["sport", "athlete", "game", "competition", "team", "player", "championship", "olympic"],
    "Business": ["business", "company", "market", "economy", "finance", "investment", "corporate", "industry"],
    "Politics": ["government", "political", "policy", "election", "politician", "law", "congress", "president"],
    "Entertainment": ["movie", "music", "celebrity", "film", "actor", "entertainment", "show", "television"],
    "Education": ["school", "student", "education", "teacher", "university", "learning", "academic", "classroom"],
    "Environment": ["environment", "climate", "nature", "pollution", "sustainable", "green", "ecology", "conservation"],
    "History": ["history", "historical", "ancient", "past", "century", "era", "civilization", "culture"],
    "Literature": ["book", "author", "novel", "poetry", "literature", "writing", "story", "literary"]
}

# One compiled pattern per domain: any keyword as a whole word, optionally
# followed by a plain plural suffix ("student" also matches "students").
_DOMAIN_PATTERNS = {
    domain: re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in keywords) + r')(?:s|es)?\b'
    )
    for domain, keywords in _DOMAIN_KEYWORDS.items()
}


def classify_domain(text):
    """Classify an article into a subject domain by counting keyword hits.

    Keywords are matched case-insensitively as whole words (with an optional
    ``s``/``es`` plural), so "app" no longer fires on "happy" or "approach"
    and the "AI" keyword actually matches.

    Args:
        text: Article text. ``None`` or empty input is treated as having no
            keywords.

    Returns:
        The name of the domain with the most keyword occurrences; on a tie the
        domain listed first in ``_DOMAIN_KEYWORDS`` wins. ``"General"`` when no
        keyword occurs at all.
    """
    if not text:
        return "General"

    text_lower = str(text).lower()

    best_domain, best_score = "General", 0
    for domain, pattern in _DOMAIN_PATTERNS.items():
        score = len(pattern.findall(text_lower))
        # Strict '>' keeps the earliest domain on ties.
        if score > best_score:
            best_domain, best_score = domain, score

    return best_domain

# Stage classification based on text complexity
def classify_stage(text):
    """Guess the school stage of a text from its length and sentence length.

    The text is "Middle School" when it has fewer than 200 whitespace-separated
    words or averages fewer than 15 words per sentence; otherwise it is
    "High School". Sentences are approximated by counting '.', '!' and '?'
    characters (at least one sentence is always assumed).
    """
    total_words = len(text.split())
    terminators = sum(text.count(mark) for mark in ".!?")
    sentence_count = terminators if terminators >= 1 else 1
    words_per_sentence = total_words / sentence_count

    is_simple = total_words < 200 or words_per_sentence < 15
    return "Middle School" if is_simple else "High School"

# Clean and anonymize text while preserving formatting
# (compiled pattern, replacement) pairs, applied in this order.
_ANONYMIZE_RULES = (
    (re.compile(r'\b\d{3}-\d{3}-\d{4}\b'), 'xxx-xxx-xxxx'),  # Phone numbers
    # Email. The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
    # literal '|' and could swallow text following the address.
    (re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'), 'xxxx@xxxx.xxx'),
    (re.compile(r'\b\d{3,4}\s?\d{3,4}\s?\d{4}\b'), 'xxxxxxxxxxxx'),  # Various phone formats
    (re.compile(r'\b\d{4}-\d{4}-\d{4}-\d{4}\b'), 'xxxx-xxxx-xxxx-xxxx'),  # Credit card format
)

_HTML_TAG = re.compile(r'<[^>]+>')
_EXCESS_NEWLINES = re.compile(r'\n{4,}')
_EXCESS_SPACES = re.compile(r'[ \t]{3,}')


def clean_and_anonymize_text(text):
    """Strip markup, mask personal data and tidy whitespace in an article.

    Steps, in order: remove anything that looks like an HTML tag; replace
    phone numbers, email addresses and hyphenated 16-digit card numbers with
    ``x`` placeholders; collapse runs of 4+ newlines to 3 and runs of 3+
    spaces/tabs to 2; drop trailing whitespace on every line; strip the
    whole text. Single line breaks and paragraph spacing are preserved.

    Args:
        text: Raw article text. Non-string values are converted with ``str``.

    Returns:
        The cleaned text, or ``""`` when *text* is falsy or a missing value
        (``None`` / NaN).
    """
    if not text or pd.isna(text):
        return ""

    text = _HTML_TAG.sub('', str(text))

    for pattern, replacement in _ANONYMIZE_RULES:
        text = pattern.sub(replacement, text)

    # Only squeeze *excessive* whitespace so the original layout survives.
    text = _EXCESS_NEWLINES.sub('\n\n\n', text)
    text = _EXCESS_SPACES.sub('  ', text)

    # Trailing whitespace is removed per line; leading indentation is kept.
    text = '\n'.join(line.rstrip() for line in text.split('\n'))

    # Only strip the very beginning and end
    return text.strip()

# Generate hash ID
def generate_hash_id(text):
    """Return the hexadecimal MD5 digest of *text*, encoded as UTF-8.

    The digest is used purely as an identifier, not for security.
    """
    digest = hashlib.md5()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()

# Enhanced answer extraction and formatting
def extract_clean_answer(answer_text):
    """Reduce a free-form answer field to a single option letter.

    The text is upper-cased and scanned for the first stand-alone letter
    A, B, C or D (a letter with word boundaries on both sides). A letter at
    the very start of the text is simply the earliest such occurrence.

    Returns:
        The letter found, or ``""`` when the input is falsy or holds no
        stand-alone A-D letter.
    """
    if not answer_text:
        return ""

    normalized = str(answer_text).strip().upper()
    hit = re.search(r'\b([A-D])\b', normalized)
    return hit.group(1) if hit else ""

# Enhanced options formatting
def format_options_with_labels(options_list):
    """Prefix up to four options with "A. ", "B. ", "C. ", "D. ".

    Each option is converted to a string and trimmed, and an existing leading
    label of the form ``A)`` / ``A.`` (upper-case A-D) is removed before the
    new label is added. Options beyond the fourth are dropped.

    Returns:
        The labelled options, or ``[]`` when the input is empty or not a list.
    """
    if not (options_list and isinstance(options_list, list)):
        return []

    existing_label = re.compile(r'^[A-D][).]\s*')
    labelled = []
    # zip stops after four items, which caps the output at A-D.
    for letter, raw_option in zip('ABCD', options_list):
        option_text = existing_label.sub('', str(raw_option).strip())
        labelled.append(letter + ". " + option_text)
    return labelled

# OpenAI API call with retry logic
def call_openai_with_retry(messages, max_retries=3):
    """Send a chat completion request, retrying on failure.

    The request goes through whichever client style was selected at start-up
    (``use_new_api``). Any exception triggers a retry until the final attempt,
    whose exception is re-raised. Errors whose message mentions "rate",
    "limit" or "429" wait with exponential back-off
    (``2 ** attempt * config.retry_delay`` seconds); all other errors wait a
    flat ``config.retry_delay`` seconds.

    Args:
        messages: Chat messages in the OpenAI ``role``/``content`` format.
        max_retries: Total number of attempts.

    Returns:
        The stripped text of the first choice, or ``None`` if no attempt was
        made (``max_retries`` <= 0).
    """
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        try:
            # Both client generations take the same keyword arguments; only
            # the entry point differs.
            create_completion = (
                openai_client.chat.completions.create
                if use_new_api
                else openai_client.ChatCompletion.create
            )
            completion = create_completion(
                model=config.model,
                messages=messages,
                max_tokens=2000,
                temperature=0.8,  # Increased for more variety
            )
            return completion.choices[0].message.content.strip()
        except Exception as exc:
            lowered = str(exc).lower()
            if attempt >= final_attempt:
                # Out of attempts: surface the last error to the caller.
                raise
            if any(marker in lowered for marker in ("rate", "limit", "429")):
                wait_time = (2 ** attempt) * config.retry_delay
                print(f"Rate limit hit, waiting {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Error occurred: {str(exc)}, retrying...")
                time.sleep(config.retry_delay)

    return None

# Generate questions for a single article
def _build_question_prompt(clean_text):
    """Build the user prompt for one article, sampling fresh question starters."""
    # Randomly select question starters for variety
    detail_starters = random.sample(QUESTION_STARTERS["detail_comprehension"], 2)
    inference_starters = random.sample(QUESTION_STARTERS["inference_reasoning"], 2)
    opinion_starters = random.sample(QUESTION_STARTERS["opinion_attitude"], 2)
    main_idea_starter = random.choice(QUESTION_STARTERS["main_idea"])

    return f"""
You are a professional educator creating reading comprehension questions. Create EXACTLY 7 multiple choice questions based on this article. Make each question sound natural and unique - avoid repetitive patterns.

ARTICLE:
{clean_text}

Create questions using these varied approaches:

DETAIL COMPREHENSION (2 questions) - Use these starters:
1. "{detail_starters[0]}..."
2. "{detail_starters[1]}..."

INFERENCE & REASONING (2 questions) - Use these starters:
3. "{inference_starters[0]}..."
4. "{inference_starters[1]}..."

OPINION & ATTITUDE (2 questions) - Use these starters:
5. "{opinion_starters[0]}..."
6. "{opinion_starters[1]}..."

MAIN IDEA (1 question) - Use this starter:
7. "{main_idea_starter}..."

CRITICAL REQUIREMENTS:
- Questions must sound natural and conversational
- Base ALL questions on specific content from this article
- Provide 4 options for each question (will be labeled A, B, C, D automatically)
- Answer field should contain ONLY the letter (A, B, C, or D)
- Make wrong answers plausible but clearly incorrect
- Vary your language and avoid repetitive phrases
- Analysis should be ONLY the direct explanation - NO introductory phrases like "This is correct because" or "The passage shows that" - just give the pure explanation

Format as JSON:
[
  {{
    "type": "detail_comprehension",
    "question": "Complete question using the starter provided...",
    "options": ["First option", "Second option", "Third option", "Fourth option"],
    "answer": "B",
    "analysis": "The company expanded operations to five new countries last year according to the financial report."
  }}
]
"""


def _parse_questions_json(response):
    """Return the JSON list contained in *response*, or None if there is none."""
    # Try the whole response first, then the outermost [...] span. The second
    # candidate covers replies wrapped in prose, code fences or an object
    # such as {"questions": [...]}.
    candidates = [response]
    bracketed = re.search(r'\[.*\]', response, re.DOTALL)
    if bracketed:
        candidates.append(bracketed.group())

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            return parsed
    return None


def _strip_analysis_lead_in(raw_analysis):
    """Remove boilerplate openers from an analysis and return the tidy remainder."""
    # Coerce null / non-string values and trim first: every pattern below is
    # anchored with ^, so leading whitespace would otherwise defeat it.
    cleaned = "" if raw_analysis is None else str(raw_analysis).strip()

    # Remove all common analysis starter phrases
    for pattern in ANALYSIS_PHRASES_TO_REMOVE:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)

    # Clean up any remaining common patterns
    cleaned = re.sub(r'^(The|This|It)\s+(is\s+)?(correct|right|true)\s+(because|as|since)\s*', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'^(According to|Based on|From)\s+the\s+(passage|text|article),?\s*', '', cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.strip()

    # Ensure it starts with a capital letter if there's content
    if cleaned and not cleaned[0].isupper():
        cleaned = cleaned[0].upper() + cleaned[1:]

    # If analysis is empty after cleaning, provide a minimal fallback
    if not cleaned:
        cleaned = "The passage provides specific information supporting this option."
    return cleaned


def generate_questions_for_article(article_data, article_index):
    """Generate up to 7 multiple-choice questions for a single article.

    Args:
        article_data: ``(article_text, source_url)`` pair.
        article_index: Position of the article in the input; used only in
            error messages.

    Returns:
        A list of question records (possibly empty), or ``None`` when the
        article is shorter than 50 words after cleaning, the API returns
        nothing, the reply contains no JSON list, or any unexpected error
        occurs. Errors are printed rather than raised so that one bad article
        does not abort a whole batch.
    """
    try:
        article_text, source_url = article_data

        # Clean and prepare text
        clean_text = clean_and_anonymize_text(article_text)
        word_count = len(clean_text.split()) if clean_text else 0
        if word_count < 50:
            return None

        # Classify domain and stage
        domain = classify_domain(clean_text)
        stage = classify_stage(clean_text)

        messages = [
            {"role": "system", "content": "You are an expert educator creating diverse, natural-sounding reading comprehension questions. Avoid repetitive patterns and make each question unique while maintaining educational value."},
            {"role": "user", "content": _build_question_prompt(clean_text)}
        ]

        response = call_openai_with_retry(messages)
        if not response:
            return None

        questions_data = _parse_questions_json(response)
        if questions_data is None:
            return None

        # The prompt fixes the order of question categories, so the category
        # is assigned by position rather than taken from the model's reply.
        question_types = ["detail_comprehension", "detail_comprehension", "inference_reasoning",
                         "inference_reasoning", "opinion_attitude", "opinion_attitude", "main_idea"]
        processing_date = datetime.now().strftime("%Y-%m-%d")

        processed_questions = []
        for i, q_data in enumerate(questions_data[:7]):  # Max 7 questions
            try:
                question_obj = {
                    # Article text plus position gives each question its own id
                    "id": generate_hash_id(f"{clean_text}_{i}"),
                    "text": clean_text,  # Cleaned article text
                    "meta": {
                        "data_info": {
                            "lang": "en",
                            "source": source_url,
                            "type": "Reading Comprehension",
                            "processing_date": processing_date,
                            "Question": q_data.get('question', ''),
                            "Options": format_options_with_labels(q_data.get('options', [])),
                            "Answer": extract_clean_answer(q_data.get('answer', '')),  # Just the letter
                            "Analysis": _strip_analysis_lead_in(q_data.get('analysis', '')),
                            "Question_Type": question_types[i] if i < len(question_types) else q_data.get('type', ''),
                            "word_count": word_count
                        },
                        "subject_info": {
                            "subject": "English",
                            "domain": domain,
                            "stage": stage
                        }
                    }
                }
                processed_questions.append(question_obj)
            except Exception as e:
                # Best effort: a malformed item (e.g. not a JSON object) is
                # reported and skipped, the remaining questions are kept.
                print(f"Error processing question {i} for article {article_index}: {str(e)}")
                continue

        return processed_questions

    except Exception as e:
        # Worker boundary: report and give up on this article only.
        print(f"Error processing article {article_index}: {str(e)}")
        return None

# Main processing function
def process_articles_batch(articles_batch, start_index):
    """Generate questions for one batch of articles using a thread pool.

    Args:
        articles_batch: Sequence of ``(article_text, source_url)`` pairs.
            Entries that are empty, or whose text is empty or missing, are
            skipped.
        start_index: Index of the first article of this batch within the full
            input; used to label errors.

    Returns:
        A flat list with the questions of every article that succeeded, in
        the order in which the worker threads finished.
    """
    collected = []

    with ThreadPoolExecutor(max_workers=config.max_workers) as pool:
        # Submit tasks, remembering which article each future belongs to.
        pending = {}
        for offset, article_data in enumerate(articles_batch):
            if not article_data or not article_data[0] or pd.isna(article_data[0]):
                continue
            job = pool.submit(generate_questions_for_article, article_data, start_index + offset)
            pending[job] = start_index + offset

        # Collect results as they complete.
        for job in as_completed(pending):
            article_index = pending[job]
            try:
                # The future is already done here, so this timeout never
                # actually limits how long a worker may run.
                article_questions = job.result(timeout=60)  # 60 second timeout
                if article_questions:
                    collected.extend(article_questions)
            except Exception as e:
                print(f"Error processing article {article_index}: {str(e)}")

    return collected

# Load and process CSV file
print("\n📄 Upload your CSV file with articles...")
uploaded = files.upload()

# Guard against an empty upload result (for example a dismissed file dialog),
# which would otherwise surface as an unexplained IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a CSV file.")

# Only the first uploaded file is processed.
csv_filename = next(iter(uploaded))
print(f"Processing: {csv_filename}")

# Load CSV
df = pd.read_csv(csv_filename)
print(f"Loaded {len(df)} articles")

# Detect columns (assume first column is articles, second is source URLs)
article_column = df.columns[0]
source_url_column = df.columns[1] if len(df.columns) > 1 else None

# Create paired data (article, source_url). Rows without an article are
# dropped; a missing URL (or a missing URL column) falls back to a fixed label.
# Iterating the columns directly avoids building one Series per row, which is
# what DataFrame.iterrows() does and which is slow on large inputs.
default_source = "Exam Question Bank"
if source_url_column is not None:
    source_values = df[source_url_column]
else:
    source_values = [None] * len(df)

articles_data = [
    (article, source if pd.notna(source) else default_source)
    for article, source in zip(df[article_column], source_values)
    if pd.notna(article)
]

print(f"Processing {len(articles_data)} valid articles with source URLs...")
print("Enhanced version will create more natural, varied questions. This may take several hours for 1M articles.")

# Process articles in batches
all_results = []
batch_size = config.batch_size

# Progress tracking
total_batches = (len(articles_data) + batch_size - 1) // batch_size
processed_articles = 0

# Checkpointing: write intermediate results once at least this many articles
# have been processed since the previous checkpoint. Tracking the distance
# (instead of testing `processed_articles % 1000 == 0`) also works when
# batch_size does not divide 1000 evenly.
checkpoint_interval = 1000
last_checkpoint = 0

print(f"\n🔄 Starting enhanced processing in {total_batches} batches...")

for batch_num in tqdm(range(0, len(articles_data), batch_size), desc="Processing batches"):
    # batch_num is the article offset of the batch; batch_index counts batches.
    batch_index = batch_num // batch_size
    batch = articles_data[batch_num:batch_num + batch_size]
    start_time = time.time()

    batch_results = process_articles_batch(batch, batch_num)
    all_results.extend(batch_results)

    processed_articles += len(batch)
    batch_time = time.time() - start_time

    # Memory management: collect every 10 batches. The counter must be the
    # batch index; the article offset is a multiple of 10 for most batch sizes
    # and would trigger a collection after every batch.
    if batch_index % 10 == 0:
        gc.collect()

    # Progress info
    questions_in_batch = len(batch_results)
    avg_time_per_article = batch_time / len(batch) if batch else 0

    print(f"Batch {batch_index + 1}/{total_batches}: {questions_in_batch} questions generated in {batch_time:.1f}s (avg: {avg_time_per_article:.2f}s/article)")

    # Save intermediate results every `checkpoint_interval` articles
    if processed_articles - last_checkpoint >= checkpoint_interval:
        temp_filename = f"/content/drive/MyDrive/{output_filename}_temp_{processed_articles}.json"
        with open(temp_filename, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        last_checkpoint = processed_articles
        print(f"💾 Saved intermediate results: {len(all_results)} questions")

print(f"\n✅ Enhanced processing complete! Generated {len(all_results)} questions from {processed_articles} articles")

# Save final results
def _flatten_result(item):
    """Flatten one nested question record into a single-level row for the CSV."""
    info = item['meta']['data_info']
    subject = item['meta']['subject_info']
    return {
        'id': item['id'],
        # Newlines are escaped so every record stays on one CSV line
        'text': item['text'].replace('\n', '\\n'),
        'lang': info['lang'],
        'source': info['source'],
        'type': info['type'],
        'processing_date': info['processing_date'],
        'question': info['Question'],
        # Options are joined into one " | "-separated string
        'options': " | ".join(info['Options']) if info['Options'] else "",
        'answer': info['Answer'],  # Just the letter
        'analysis': info['Analysis'],
        'question_type': info.get('Question_Type', ''),
        'word_count': info.get('word_count', 0),
        'subject': subject['subject'],
        'domain': subject['domain'],
        'stage': subject['stage']
    }


if not all_results:
    print("❌ No questions were generated. Please check your input data and API key.")
else:
    # Full nested structure goes to JSON
    json_filename = f"/content/drive/MyDrive/{output_filename}.json"
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)

    # Flat one-row-per-question version goes to CSV
    csv_data = [_flatten_result(item) for item in all_results]
    csv_df = pd.DataFrame(csv_data)
    csv_filename = f"/content/drive/MyDrive/{output_filename}.csv"
    # quoting=1 is csv.QUOTE_ALL: every field is quoted
    csv_df.to_csv(csv_filename, index=False, encoding='utf-8', quoting=1)

    print(f"\n🎉 Enhanced results saved successfully!")
    print(f"📁 JSON file: {json_filename}")
    print(f"📁 CSV file: {csv_filename}")
    print(f"📊 Total questions generated: {len(all_results)}")

    # Summary statistics: frequency of each value per column
    domains = csv_df['domain'].value_counts()
    stages = csv_df['stage'].value_counts()
    question_types = csv_df['question_type'].value_counts()
    answer_dist = csv_df['answer'].value_counts()

    print(f"\n📈 Enhanced Summary Statistics:")
    print(f"Domains: {dict(domains)}")
    print(f"Stages: {dict(stages)}")
    print(f"Question Types: {dict(question_types)}")
    print(f"Answer Distribution: {dict(answer_dist)}")

    # Quality checks: share of rows with a bare A-D answer / labelled options
    total_rows = len(csv_df)

    clean_answers = csv_df['answer'].str.match(r'^[A-D]$').sum()
    print(f"✅ Clean answers (A-D only): {clean_answers}/{total_rows} ({clean_answers/total_rows*100:.1f}%)")

    proper_options = csv_df['options'].str.contains(r'A\.|B\.|C\.|D\.').sum()
    print(f"✅ Properly formatted options: {proper_options}/{total_rows} ({proper_options/total_rows*100:.1f}%)")

print("\n🏁 Enhanced process completed with improved quality!")