In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 📚 GPU-Optimized Question Analysis Pipeline - FIXED Version
# ----------------------------------------------------
# Fixes: Better question extraction + Remove question numbers

# ✅ STEP 0: Install Required Libraries
!pip install transformers sentence-transformers pdfplumber torch torchvision torchaudio pytesseract Pillow pdf2image --quiet
!apt-get update && apt-get install -y tesseract-ocr poppler-utils

import pdfplumber
import pandas as pd
import re
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer, util
from IPython.display import display
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from collections import defaultdict
import hashlib

# ✅ GPU Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🚀 Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory // 1024**3} GB")

# ✅ STEP 1: FIXED Question Extraction
question_pdf_path = "/content/drive/MyDrive/Os question bank.pdf"

def extract_questions_fixed(pdf_path):
    """
    FIXED: Better question extraction for all formats
    """
    print("🔍 Starting FIXED question extraction...")

    # Try text extraction first
    questions_text = extract_questions_from_text_fixed(pdf_path)
    print(f"✅ Text extraction found: {len(questions_text)} questions")

    # If insufficient, try OCR
    if len(questions_text) < 50:
        print("🔍 Enhancing with OCR...")
        questions_ocr = extract_questions_from_ocr_fixed(pdf_path)
        print(f"✅ OCR extraction found: {len(questions_ocr)} questions")

        # Combine and deduplicate
        all_questions = questions_text + questions_ocr
        questions_text = remove_exact_duplicates(all_questions)

    print(f"✅ After deduplication: {len(questions_text)} unique questions")
    return questions_text

def extract_questions_from_text_fixed(pdf_path):
    """FIXED text extraction with better patterns"""
    questions = []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                text = page.extract_text()
                if text:
                    page_questions = process_page_text_fixed(text, page_num + 1)
                    questions.extend(page_questions)
                    print(f"📄 Page {page_num + 1}: Found {len(page_questions)} questions")
    except Exception as e:
        print(f"Text extraction error: {e}")

    return questions

def process_page_text_fixed(text, page_num):
    """FIXED: Comprehensive question detection"""
    questions = []

    # Split into lines and clean
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Process each line
    for line in lines:
        # Skip headers and unwanted content
        if should_skip_line(line):
            continue

        # Extract question from line
        question = extract_question_from_line(line)
        if question:
            questions.append(question)

    return questions

def should_skip_line(line):
    """Check if line should be skipped"""
    skip_patterns = [
        'research methodology', 'intellectual property rights', 'operating system',
        'unit i:', 'unit ii:', 'unit iii:', 'unit iv:', 'unit v:',
        'unique questions', '80 unique', 'question bank', 'course code',
        'credits:', 'contact hours', 'page', 'syllabus', 'introduction',
        'method of data collection', 'research design'
    ]

    line_lower = line.lower()
    return any(pattern in line_lower for pattern in skip_patterns)

def extract_question_from_line(line):
    """FIXED: Extract clean question from line"""
    original_line = line.strip()

    # Remove question numbers at the beginning
    line = remove_question_numbers(line)

    # Check if it's a valid question
    if not is_valid_question_fixed(line):
        return None

    # Clean the question
    clean_question = clean_question_text_fixed(line)

    return clean_question

def remove_question_numbers(line):
    """FIXED: Remove question numbers from beginning"""
    # Patterns to remove question numbers
    patterns = [
        r'^\d+\.\s*',           # 1.
        r'^Q\d+\.\s*',          # Q1.
        r'^\d+\)\s*',           # 1)
        r'^\d+\s+',             # 1 (followed by space)
        r'^[IVX]+\.\s*',        # I. II. III.
        r'^\([a-z]\)\s*',       # (a) (b) (c)
        r'^\w+\s*:\s*',         # Unit I: etc
    ]

    cleaned_line = line
    for pattern in patterns:
        cleaned_line = re.sub(pattern, '', cleaned_line).strip()

    return cleaned_line

def is_valid_question_fixed(text):
    """FIXED: Better question validation"""
    if not text or len(text.strip()) < 5:
        return False

    text = text.strip()

    # Must end with question mark OR period
    if not (text.endswith('?') or text.endswith('.')):
        return False

    # Must have minimum words
    words = text.split()
    if len(words) < 3:
        return False

    # Skip if too long (likely paragraph)
    if len(words) > 25:
        return False

    # Check for question indicators
    question_indicators = [
        # Question words
        'what', 'how', 'why', 'when', 'where', 'which', 'who',
        # Command words
        'define', 'explain', 'describe', 'list', 'discuss', 'mention',
        'compare', 'differentiate', 'distinguish', 'analyze', 'examine',
        'illustrate', 'evaluate', 'assess', 'write', 'draw', 'demonstrate',
        'simulate', 'apply', 'implement', 'solve', 'calculate', 'translate',
        'design', 'create', 'build', 'develop', 'construct', 'model'
    ]

    first_words = ' '.join(words[:3]).lower()
    has_indicator = any(indicator in first_words for indicator in question_indicators)

    # For statements ending with period, they should have command words
    if text.endswith('.'):
        first_word = words[0].lower() if words else ""
        command_words = ['define', 'explain', 'describe', 'list', 'discuss', 'mention',
                        'compare', 'differentiate', 'distinguish', 'analyze', 'examine',
                        'illustrate', 'evaluate', 'assess', 'write', 'draw', 'demonstrate',
                        'simulate', 'apply', 'implement', 'solve', 'calculate', 'translate',
                        'design', 'create', 'build', 'develop', 'construct', 'model']

        if first_word in command_words:
            has_indicator = True

    return has_indicator

def clean_question_text_fixed(text):
    """FIXED: Clean question text"""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Ensure proper capitalization
    if text and text[0].islower():
        text = text[0].upper() + text[1:]

    # Ensure proper ending
    if text.endswith('?'):
        text = text.rstrip('?') + '?'
    elif text.endswith('.'):
        text = text.rstrip('.') + '.'

    # Remove any remaining artifacts
    text = re.sub(r'^[^\w\s]*', '', text)  # Remove leading non-word chars

    return text

def extract_questions_from_ocr_fixed(pdf_path):
    """FIXED OCR extraction"""
    questions = []

    try:
        images = convert_from_path(pdf_path, dpi=300)
        for i, image in enumerate(images):
            print(f"📄 OCR processing page {i+1}/{len(images)}")

            # Enhanced OCR configuration
            custom_config = r'--oem 3 --psm 6'
            ocr_text = pytesseract.image_to_string(image, config=custom_config)

            page_questions = process_page_text_fixed(ocr_text, i + 1)
            questions.extend(page_questions)
    except Exception as e:
        print(f"OCR extraction error: {e}")

    return questions

def remove_exact_duplicates(questions):
    """Remove exact duplicates while preserving order"""
    seen = set()
    unique_questions = []

    for question in questions:
        # Normalize for comparison
        normalized = question.lower().strip()
        normalized = re.sub(r'\s+', ' ', normalized)
        normalized = re.sub(r'[^\w\s]', '', normalized)

        if normalized and normalized not in seen and len(normalized) > 5:
            seen.add(normalized)
            unique_questions.append(question)

    return unique_questions

# Extract questions
questions = extract_questions_fixed(question_pdf_path)
print(f"\n📊 QUESTION EXTRACTION SUMMARY")
print(f"Total unique questions extracted: {len(questions)}")

if questions:
    print("\n📋 Sample questions:")
    for i, q in enumerate(questions[:10]):
        print(f"{i+1}. {q}")

# ✅ STEP 2: Topic Extraction (Already working well)
syllabus_pdf_path = "/content/drive/MyDrive/os syallabus.pdf"

def extract_topics_all_units_final(pdf_path):
    """Topic extraction - keeping existing working version"""
    print("\n🔍 Starting topic extraction for all units...")

    # Try text extraction first
    unit_topics_text = extract_topics_from_text_all_units_final(pdf_path)

    # If insufficient, try OCR
    total_topics = sum(len(topics) for topics in unit_topics_text.values())
    if total_topics < 20:
        print("🔍 Text extraction insufficient, trying OCR...")
        unit_topics_ocr = extract_topics_from_ocr_all_units_final(pdf_path)

        # Merge results
        for unit, topics in unit_topics_ocr.items():
            if unit in unit_topics_text:
                unit_topics_text[unit].extend(topics)
            else:
                unit_topics_text[unit] = topics

    # Clean and deduplicate
    cleaned_unit_topics = {}
    for unit, topics in unit_topics_text.items():
        cleaned_topics = clean_and_deduplicate_topics(topics)
        if cleaned_topics:
            cleaned_unit_topics[unit] = cleaned_topics

    return cleaned_unit_topics

def extract_topics_from_text_all_units_final(pdf_path):
    """Final text extraction for all units"""
    unit_topics = defaultdict(list)

    try:
        with pdfplumber.open(pdf_path) as pdf:
            full_text = ""
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    full_text += text + "\n"

        if full_text.strip():
            unit_topics = parse_all_units_final(full_text)

    except Exception as e:
        print(f"Text extraction error for topics: {e}")

    return unit_topics

def extract_topics_from_ocr_all_units_final(pdf_path):
    """Final OCR extraction for all units"""
    unit_topics = defaultdict(list)

    try:
        images = convert_from_path(pdf_path, dpi=300)
        full_ocr_text = ""

        for i, image in enumerate(images):
            print(f"📄 OCR processing page {i+1}/{len(images)} for topics")

            ocr_text = pytesseract.image_to_string(image, lang='eng')
            full_ocr_text += ocr_text + "\n"

        if full_ocr_text.strip():
            unit_topics = parse_all_units_final(full_ocr_text)

    except Exception as e:
        print(f"OCR extraction error for topics: {e}")

    return unit_topics

def parse_all_units_final(text):
    """Final parsing to capture all units"""
    unit_topics = defaultdict(list)

    # Remove unwanted content first
    text = remove_unwanted_content_final(text)

    # Strategy 1: Standard unit parsing
    unit_patterns = [
        r'Unit\s+([IVX\d]+)[:\s]*\n(.*?)(?=Unit\s+[IVX\d]+|Course\s+Outcomes|Suggested\s+Learning|Text\s+Books|Reference\s+Books|$)',
        r'Unit\s+([IVX\d]+)[:\s]*(.*?)(?=Unit\s+[IVX\d]+|Course\s+Outcomes|Suggested\s+Learning|Text\s+Books|Reference\s+Books|$)',
        r'Unit\s+([IVX\d]+)\s*\n(.*?)(?=Unit\s+[IVX\d]+|Course\s+Outcomes|$)'
    ]

    # Try each pattern
    for pattern_idx, pattern in enumerate(unit_patterns):
        unit_matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)

        if unit_matches and len(unit_matches) >= 2:
            print(f"✅ Using pattern {pattern_idx + 1}: Found {len(unit_matches)} units")

            for unit_num, unit_content in unit_matches:
                unit_key = f"Unit {unit_num}"
                topics = extract_topics_from_unit_content_final(unit_content)

                if topics:
                    unit_topics[unit_key].extend(topics)
                    print(f"  {unit_key}: {len(topics)} topics")

            if len(unit_topics) >= 2:
                break

    # Strategy 2: If standard parsing failed, try line-by-line approach
    if len(unit_topics) < 2:
        print("🔍 Trying alternative unit detection...")
        unit_topics = parse_units_line_by_line(text)

    return dict(unit_topics)

def parse_units_line_by_line(text):
    """Alternative line-by-line parsing approach"""
    unit_topics = defaultdict(list)
    current_unit = None
    current_content = []

    lines = text.split('\n')

    for line in lines:
        line = line.strip()

        # Check if line is a unit header
        unit_match = re.match(r'Unit\s+([IVX\d]+)', line, re.IGNORECASE)
        if unit_match:
            # Save previous unit content
            if current_unit and current_content:
                content_text = '\n'.join(current_content)
                topics = extract_topics_from_unit_content_final(content_text)
                if topics:
                    unit_topics[current_unit].extend(topics)
                    print(f"  {current_unit}: {len(topics)} topics")

            # Start new unit
            current_unit = f"Unit {unit_match.group(1)}"
            current_content = []

        # Check if we've reached end sections
        elif any(end_marker in line for end_marker in ['Course Outcomes', 'Suggested Learning',
                                                      'Text Books', 'Reference Books']):
            # Save final unit content
            if current_unit and current_content:
                content_text = '\n'.join(current_content)
                topics = extract_topics_from_unit_content_final(content_text)
                if topics:
                    unit_topics[current_unit].extend(topics)
                    print(f"  {current_unit}: {len(topics)} topics")
            break

        # Add line to current unit content
        elif current_unit and line:
            current_content.append(line)

    # Don't forget the last unit
    if current_unit and current_content:
        content_text = '\n'.join(current_content)
        topics = extract_topics_from_unit_content_final(content_text)
        if topics:
            unit_topics[current_unit].extend(topics)
            print(f"  {current_unit}: {len(topics)} topics")

    return dict(unit_topics)

def remove_unwanted_content_final(text):
    """Remove unwanted content"""
    unwanted_patterns = [
        r'Pedagogy[:/].*?(?=\n\n|\nUnit|\n[A-Z])',
        r'Links[:/].*?(?=\n\n|\nUnit|\n[A-Z])',
        r'Impartus.*?(?=\n\n|\nUnit|\n[A-Z])',
        r'https?://[^\s]+',
        r'Course\s+Code[:/].*?(?=\n)',
        r'Credits[:/].*?(?=\n)',
        r'Contact\s+Hours[:/].*?(?=\n)',
        r'Course\s+Coordinator[:/].*?(?=\n)',
        r'Prerequisites?[:/].*?(?=\n)',
        r'Pre\s*–\s*requisites[:/].*?(?=\n)',
    ]

    cleaned_text = text
    for pattern in unwanted_patterns:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE | re.DOTALL)

    return cleaned_text

def extract_topics_from_unit_content_final(content):
    """Extract topics from unit content"""
    topics = []

    # Multiple splitting strategies
    split_patterns = [
        r'[;\n]',           # Semicolon or newline
        r'[,\n]',           # Comma or newline
        r'[\n]'             # Just newline
    ]

    for pattern in split_patterns:
        potential_topics = re.split(pattern, content)
        temp_topics = []

        for topic in potential_topics:
            topic = topic.strip()

            # Clean the topic
            topic = re.sub(r'^[•\-\*\d\.\s()]+', '', topic)
            topic = re.sub(r'^\w+\s*:', '', topic)
            topic = re.sub(r'\s+', ' ', topic).strip()

            # Quality checks
            if is_valid_topic_final(topic):
                temp_topics.append(topic)

        # Use the pattern that gives most topics
        if len(temp_topics) > len(topics):
            topics = temp_topics

    return topics

def is_valid_topic_final(topic):
    """Topic validation"""
    if not topic or len(topic) < 4:
        return False

    # Length checks
    if len(topic) > 120 or len(topic) < 4:
        return False

    # Skip all caps
    if topic.isupper() and len(topic) > 15:
        return False

    # Unwanted keywords
    unwanted_keywords = [
        'pedagogy', 'delivery', 'tools', 'chalk', 'talk', 'powerpoint',
        'video', 'link', 'nptel', 'impartus', 'recording', 'course code',
        'credit', 'contact hour', 'coordinator', 'prerequisite'
    ]

    topic_lower = topic.lower()
    if any(keyword in topic_lower for keyword in unwanted_keywords):
        return False

    # Must have reasonable word count
    word_count = len(topic.split())
    if word_count < 2 or word_count > 15:
        return False

    return True

def clean_and_deduplicate_topics(topics):
    """Clean and deduplicate topics"""
    if not topics:
        return []

    cleaned_topics = []
    seen_normalized = set()

    for topic in topics:
        # Normalize for comparison
        normalized = topic.lower().strip()
        normalized = re.sub(r'\s+', ' ', normalized)
        normalized = re.sub(r'[^\w\s]', '', normalized)

        if normalized and normalized not in seen_normalized and len(normalized) > 3:
            seen_normalized.add(normalized)
            cleaned_topics.append(topic.strip())

    return cleaned_topics

# Extract topics by units
unit_topics = extract_topics_all_units_final(syllabus_pdf_path)

print(f"\n📊 TOPIC EXTRACTION SUMMARY")
total_topics = sum(len(topics) for topics in unit_topics.values())
print(f"Total units found: {len(unit_topics)}")
print(f"Total topics extracted: {total_topics}")

for unit, topics in unit_topics.items():
    print(f"\n📚 {unit}: {len(topics)} topics")
    for i, topic in enumerate(topics[:5]):
        print(f"  {i+1}. {topic}")
    if len(topics) > 5:
        print(f"  ... and {len(topics)-5} more")

# ✅ STEP 3: GPU-Optimized RoBERTa Model (Keep existing)
class RobertaMultiTask(nn.Module):
    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(self.roberta.config.hidden_size, 1)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = self.dropout(output.last_hidden_state[:, 0, :])
        mark_pred = self.regressor(cls_output).squeeze(-1)
        bloom_pred = self.classifier(cls_output)
        return mark_pred, bloom_pred

# Load model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model_path = "/content/drive/MyDrive/roberta_multitask_model_full.pt"

try:
    from torch.serialization import add_safe_globals
    add_safe_globals({'RobertaMultiTask': RobertaMultiTask})
    model = torch.load(model_path, map_location=device, weights_only=False)
    model = model.to(device)
    model.eval()
    print(f"✅ RoBERTa model loaded on {device}")
except Exception as e:
    print(f"⚠️ Model loading failed: {e}")
    print("Using dummy predictor for demonstration...")
    model = None

# ✅ STEP 4: Prediction Functions (Keep existing)
bloom_map = {0: "L1", 1: "L2", 2: "L3", 3: "L4", 4: "L5", 5: "L6"}

def predict_question_gpu(question):
    """GPU-optimized prediction with integer marks and minimum 3"""
    if model is None:
        import random
        marks = max(3, random.randint(3, 8))  # Integer marks, minimum 3
        bloom = random.choice(list(bloom_map.values()))
        return marks, bloom

    inputs = tokenizer(question, return_tensors="pt", padding='max_length', truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        mark_pred, bloom_pred = model(inputs['input_ids'], inputs['attention_mask'])
        marks = max(3, round(mark_pred.cpu().item()))  # Integer marks, minimum 3
        bloom = bloom_map[torch.argmax(bloom_pred).cpu().item()]
    return marks, bloom

def predict_batch_gpu(questions, batch_size=32):
    """GPU-optimized batch prediction with integer marks and minimum 3"""
    if model is None:
        import random
        results = []
        for _ in questions:
            marks = max(3, random.randint(3, 8))  # Integer marks, minimum 3
            bloom = random.choice(list(bloom_map.values()))
            results.append((marks, bloom))
        return results

    results = []
    for i in range(0, len(questions), batch_size):
        batch = questions[i:i+batch_size]

        inputs = tokenizer(batch, return_tensors="pt", padding='max_length',
                          truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            mark_preds, bloom_preds = model(inputs['input_ids'], inputs['attention_mask'])
            marks = [max(3, round(mark.cpu().item())) for mark in mark_preds]  # Integer marks, minimum 3
            blooms = [bloom_map[torch.argmax(bloom_pred).cpu().item()] for bloom_pred in bloom_preds]

        results.extend(list(zip(marks, blooms)))

    return results

def get_difficulty(marks):
    if marks <= 3:
        return "Easy"
    elif marks <= 6:
        return "Medium"
    else:
        return "Hard"

# ✅ STEP 5: Topic Matching (Keep existing)
all_topics = []
topic_to_unit = {}

for unit, topics in unit_topics.items():
    for topic in topics:
        all_topics.append(topic)
        topic_to_unit[topic] = unit

print(f"\n🔄 Preparing GPU-optimized topic matching for {len(all_topics)} topics...")

if len(all_topics) > 0:
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    topic_embeddings = sbert_model.encode(all_topics, convert_to_tensor=True, device=device)

    def match_topics_batch_gpu(questions, batch_size=32):
        """GPU-optimized batch topic matching"""
        results = []
        for i in range(0, len(questions), batch_size):
            batch = questions[i:i+batch_size]

            q_embeddings = sbert_model.encode(batch, convert_to_tensor=True, device=device)
            scores = util.cos_sim(q_embeddings, topic_embeddings)

            for j, question_scores in enumerate(scores):
                best_topic_idx = torch.argmax(question_scores).cpu().item()
                similarity_score = question_scores[best_topic_idx].cpu().item()

                best_topic = all_topics[best_topic_idx]
                unit = topic_to_unit[best_topic]

                results.append((best_topic, unit, similarity_score))

        return results
else:
    def match_topics_batch_gpu(questions, batch_size=32):
        return [("No topics available", "Unknown Unit", 0.0) for _ in questions]

# ✅ STEP 6: Process Questions
if len(questions) > 0:
    print(f"\n🚀 Processing {len(questions)} questions...")

    batch_size = 32 if torch.cuda.is_available() else 16

    print("🔄 Running batch predictions...")
    prediction_results = predict_batch_gpu(questions, batch_size)

    print("🔄 Running batch topic matching...")
    topic_results = match_topics_batch_gpu(questions, batch_size)

    # Combine results
    results = []
    for i, (question, (marks, bloom), (matched_topic, unit, similarity)) in enumerate(zip(questions, prediction_results, topic_results)):
        difficulty = get_difficulty(marks)

        results.append({
            "question_id": i+1,
            "question": question,
            "predicted_marks": marks,
            "bloom_level": bloom,
            "difficulty": difficulty,
            "matched_topic": matched_topic,
            "matched_unit": unit,
            "topic_similarity": round(similarity, 3)
        })

    # ✅ STEP 7: Save Results
    if results:
        # Save main results
        df = pd.DataFrame(results)
        output_file = "FIXED_processed_questions.csv"
        df.to_csv(output_file, index=False)
        print(f"\n✅ Saved {len(results)} processed questions to '{output_file}'")

        # Save unit-wise topics
        unit_topics_data = []
        for unit, topics in unit_topics.items():
            for i, topic in enumerate(topics, 1):
                unit_topics_data.append({
                    "unit": unit,
                    "topic_id": i,
                    "topic": topic
                })

        if unit_topics_data:
            topics_df = pd.DataFrame(unit_topics_data)
            topics_file = "FIXED_unit_wise_topics.csv"
            topics_df.to_csv(topics_file, index=False)
            print(f"✅ Saved {len(unit_topics_data)} topics to '{topics_file}'")

        # Display results
        print(f"\n📊 FIXED PROCESSING SUMMARY")
        print("=" * 60)
        print(f"Device used: {device}")
        print(f"Questions extracted: {len(questions)}")
        print(f"Questions processed: {len(results)}")
        print(f"Units found: {len(unit_topics)}")
        print(f"Total topics: {len(all_topics)}")
        print(f"Success rate: {len(results)/len(questions)*100:.1f}%")

        print(f"\n📋 SAMPLE RESULTS:")
        display(df.head(10))

        print(f"\n📚 UNIT-WISE TOPIC DISTRIBUTION:")
        if unit_topics_data:
            unit_counts = topics_df['unit'].value_counts()
            for unit, count in unit_counts.items():
                print(f"{unit}: {count} topics")

        print(f"\n📈 QUESTION STATISTICS:")
        print("Difficulty Distribution:")
        print(df['difficulty'].value_counts())
        print("\nBloom Level Distribution:")
        print(df['bloom_level'].value_counts())

        # Performance metrics
        if torch.cuda.is_available():
            print(f"\n🚀 GPU PERFORMANCE:")
            print(f"GPU Memory Used: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
            print(f"GPU Memory Cached: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

    else:
        print("❌ No questions were successfully processed")

else:
    print("❌ No questions found to process")

print(f"\n🎯 FIXED Process completed! Check the generated CSV files for results.")
print(f"📁 Output files:")
print(f"  - FIXED_processed_questions.csv (main results)")
print(f"  - FIXED_unit_wise_topics.csv (unit-wise topics)")

# ✅ STEP 8: Memory cleanup
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("🧹 GPU memory cleaned up")

print("\n" + "="*80)
print("🔧 KEY FIXES IMPLEMENTED:")
print("="*80)
print("1. ✅ FIXED Question Number Removal:")
print("   - Removes '1.', 'Q1.', '1)', etc. from questions")
print("   - Clean questions without numbering artifacts")
print("")
print("2. ✅ ENHANCED Question Detection:")
print("   - Better patterns for both '?' and '.' endings")
print("   - Improved validation for command-style questions")
print("   - More comprehensive question indicators")
print("")
print("3. ✅ IMPROVED Text Processing:")
print("   - Better line filtering to skip headers/metadata")
print("   - Enhanced cleaning of question text")
print("   - Reduced false positives")
print("")
print("4. ✅ ROBUST Extraction Pipeline:")
print("   - Multiple extraction strategies")
print("   - Better OCR fallback")
print("   - Improved deduplication")
print("")
print("5. ✅ INTEGER MARKS with MINIMUM 3:")
print("   - All marks stored as integers (3, 4, 5, 6, 7, 8)")
print("   - Minimum mark value enforced to be 3")
print("   - No decimal marks or marks below 3")
print("="*80)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

⚠️ Model loading failed: [Errno 2] No such file or directory: '/content/drive/MyDrive/roberta_multitask_model_full.pt'
Using dummy predictor for demonstration...

🔄 Preparing GPU-optimized topic matching for 0 topics...
❌ No questions found to process

🎯 FIXED Process completed! Check the generated CSV files for results.
📁 Output files:
  - FIXED_processed_questions.csv (main results)
  - FIXED_unit_wise_topics.csv (unit-wise topics)
🧹 GPU memory cleaned up

🔧 KEY FIXES IMPLEMENTED:
1. ✅ FIXED Question Number Removal:
   - Removes '1.', 'Q1.', '1)', etc. from questions
   - Clean questions without numbering artifacts

2. ✅ ENHANCED Question Detection:
   - Better patterns for both '?' and '.' endings
   - Improved validation for command-style questions
   - More comprehensive question indicators

3. ✅ IMPROVED Text Processing:
   - Better line filtering to skip headers/metadata
   - Enhanced cleaning of question text
   - Reduced false positives

4. ✅ ROBUST Extraction Pipeline:
   - M

In [None]:
!pip install pyngrok



In [None]:
# Fixed Google Colab Integration Code - Port Conflict Resolution
# Replace your existing Colab code with this

import requests
import json
import base64
from io import BytesIO
import pandas as pd
from pyngrok import ngrok
from flask import Flask, request, jsonify
import threading
import time
import os

# Set your ngrok auth token
ngrok.set_auth_token("2yE2JW29TGeZBmN0AIa5hSSMpdj_7cR52txvwnkCM3prHtZsR")

# ✅ FIXED: Configuration
# Since your Node.js server is on port 5000, we'll use a different port for Colab
COLAB_PORT = 8000  # Different port to avoid conflict
BACKEND_URL = "https://068e-49-204-87-250.ngrok-free.app"  # ✅ Use your backend's ngrok URL
API_KEY = "81620796872873ca49ef90a37ac274bb62343c70a21f2045ad5a43b3ff9d9eb9"

# Flask app for receiving webhook requests
app = Flask(__name__)

def send_results_to_backend(process_id, questions, topics, status="success", error_message=None):
    """
    Send processing results back to your Node.js backend
    ✅ FIXED: Use the correct backend URL
    """
    try:
        # ✅ FIXED: Use your actual backend ngrok URL
        callback_url = f"{BACKEND_URL}/api/upload/processing-complete"

        print(f"📤 Sending results to backend...")
        print(f"🔗 URL: {callback_url}")
        print(f"🔑 Process ID: {process_id}")
        print(f"📊 Questions: {len(questions) if questions else 0}")
        print(f"📋 Topics: {len(topics) if topics else 0}")
        print(f"✅ Status: {status}")

        # Prepare payload
        payload = {
            'processId': process_id,
            'status': status
        }

        if status == 'success':
            payload['questions'] = questions
            payload['topics'] = topics
        else:
            payload['error'] = error_message or 'Processing failed'

        # ✅ FIXED: Use correct header format
        headers = {
            'Content-Type': 'application/json',
            'x-api-key': API_KEY,
            'User-Agent': 'GoogleColab/1.0',
            'ngrok-skip-browser-warning': 'true'  # Skip ngrok browser warning
        }

        print(f"📡 Making request to: {callback_url}")
        print(f"🔑 Using API key: {API_KEY[:10]}...")

        # Send the request
        response = requests.post(
            callback_url,
            json=payload,
            headers=headers,
            timeout=30
        )

        print(f"📨 Response Status: {response.status_code}")
        print(f"📨 Response Text: {response.text}")

        if response.status_code == 200:
            print("✅ Results sent successfully to backend!")
            return True
        else:
            print(f"❌ Backend returned error: {response.status_code}")
            print(f"❌ Error details: {response.text}")
            return False

    except requests.exceptions.ConnectionError as e:
        print(f"❌ Connection error - check your backend URL: {str(e)}")
        return False
    except requests.exceptions.Timeout as e:
        print(f"❌ Timeout error: {str(e)}")
        return False
    except Exception as e:
        print(f"❌ Error sending results to backend: {str(e)}")
        return False

def test_backend_connection():
    """
    Test if we can reach the backend server
    """
    try:
        print("🧪 Testing backend connection...")

        # Try to reach the health endpoint
        test_url = f"{BACKEND_URL}/health"

        headers = {
            'Content-Type': 'application/json',
            'User-Agent': 'GoogleColab/1.0',
            'ngrok-skip-browser-warning': 'true'
        }

        response = requests.get(test_url, headers=headers, timeout=10)

        if response.status_code in [200, 404]:  # 404 is fine, means server is running
            print("✅ Backend is reachable!")
            return True
        else:
            print(f"⚠️ Backend returned: {response.status_code}")
            return True  # Still consider it reachable

    except requests.exceptions.ConnectionError:
        print("❌ Cannot reach backend - check the BACKEND_URL!")
        print(f"❌ Current backend URL: {BACKEND_URL}")
        return False
    except Exception as e:
        print(f"⚠️ Backend test inconclusive: {str(e)}")
        return True  # Assume it's fine

@app.route('/process-documents', methods=['POST'])
def process_documents():
    """
    Main webhook endpoint to receive processing requests from your backend
    """
    try:
        print("📥 Received processing request!")

        data = request.get_json()
        if not data:
            print("❌ No JSON data received")
            return jsonify({'error': 'No JSON data'}), 400

        print(f"📋 Request data keys: {list(data.keys())}")

        # Extract required data
        process_id = data.get('processId')
        question_bank_base64 = data.get('questionBank')
        syllabus_base64 = data.get('syllabus')
        callback_url = data.get('callbackUrl')  # This might not be used since we use our own

        print(f"🔑 Process ID: {process_id}")
        print(f"📄 Question bank size: {len(question_bank_base64) if question_bank_base64 else 0} chars")
        print(f"📋 Syllabus size: {len(syllabus_base64) if syllabus_base64 else 0} chars")

        if not all([process_id, question_bank_base64, syllabus_base64]):
            error_msg = "Missing required data: processId, questionBank, or syllabus"
            print(f"❌ {error_msg}")
            return jsonify({'error': error_msg}), 400

        # Start processing in background thread
        def process_async():
            try:
                print(f"🔄 Starting async processing for {process_id}")

                # Save temporary files
                question_bank_path = f'/tmp/question_bank_{process_id}.pdf'
                syllabus_path = f'/tmp/syllabus_{process_id}.pdf'

                # Decode and save files
                with open(question_bank_path, 'wb') as f:
                    f.write(base64.b64decode(question_bank_base64))

                with open(syllabus_path, 'wb') as f:
                    f.write(base64.b64decode(syllabus_base64))

                print("📁 Temporary files saved")

                # ✅ Use your existing processing functions
                print("📄 Extracting questions...")
                questions = extract_questions_fixed(question_bank_path)

                print("📋 Extracting topics...")
                unit_topics = extract_topics_all_units_final(syllabus_path)

                if len(questions) > 0:
                    print(f"✅ Found {len(questions)} questions")

                    # ✅ Use your existing ML processing
                    print("🤖 Processing with ML models...")
                    prediction_results = predict_batch_gpu(questions, batch_size=32)
                    topic_results = match_topics_batch_gpu(questions, batch_size=32)

                    # Format results
                    processed_questions = []
                    for i, (question, (marks, bloom), (matched_topic, unit, similarity)) in enumerate(
                        zip(questions, prediction_results, topic_results)
                    ):
                        difficulty = get_difficulty(marks)

                        processed_questions.append({
                            "question_id": i + 1,
                            "question": question,
                            "predicted_marks": int(marks),
                            "bloom_level": bloom,
                            "difficulty": difficulty,
                            "matched_topic": matched_topic,
                            "matched_unit": unit,
                            "topic_similarity": round(float(similarity), 3)
                        })

                    # Format topics
                    processed_topics = []
                    topic_id = 1
                    for unit, topics in unit_topics.items():
                        for topic in topics:
                            processed_topics.append({
                                "unit": unit,
                                "topic_id": topic_id,
                                "topic": topic
                            })
                            topic_id += 1

                    print(f"✅ Processing completed: {len(processed_questions)} questions, {len(processed_topics)} topics")

                    # Send results to backend
                    success = send_results_to_backend(
                        process_id,
                        processed_questions,
                        processed_topics,
                        'success'
                    )

                    if not success:
                        print("❌ Failed to send results")
                else:
                    print("❌ No questions extracted")
                    send_results_to_backend(process_id, [], [], 'failed', 'No questions extracted')

                # Cleanup
                try:
                    os.remove(question_bank_path)
                    os.remove(syllabus_path)
                    print("🗑️ Cleaned up temporary files")
                except:
                    pass

            except Exception as e:
                print(f"❌ Processing error: {str(e)}")
                import traceback
                traceback.print_exc()

                # Send failure status
                send_results_to_backend(process_id, [], [], 'failed', str(e))

        # Start processing in background
        thread = threading.Thread(target=process_async)
        thread.daemon = True
        thread.start()

        return jsonify({
            'status': 'processing_started',
            'processId': process_id,
            'message': 'Document processing started successfully'
        }), 200

    except Exception as e:
        print(f"❌ Webhook error: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500

@app.route('/health', methods=['GET'])
def health_check():
    """Health check endpoint"""
    return jsonify({
        "status": "healthy",
        "message": "Colab processing server is running",
        "backend_url": BACKEND_URL,
        "colab_port": COLAB_PORT,
        "timestamp": time.time()
    })

@app.route('/test', methods=['GET', 'POST'])
def test_endpoint():
    """Test endpoint for debugging"""
    return jsonify({
        "message": "Colab server is working",
        "backend_url": BACKEND_URL,
        "method": request.method,
        "colab_port": COLAB_PORT,
        "timestamp": time.time()
    })

def start_colab_server():
    """
    Start the Colab server with ngrok tunnel
    ✅ FIXED: Use different port to avoid conflict
    """
    try:
        print("🚀 Setting up Google Colab integration...")
        print(f"🔧 Using port {COLAB_PORT} for Colab server")
        print(f"🔗 Backend URL: {BACKEND_URL}")

        # Test backend connection
        backend_reachable = test_backend_connection()
        if not backend_reachable:
            print("⚠️  WARNING: Backend server not reachable!")
            print("⚠️  Make sure your Node.js server is running and accessible")
            print(f"⚠️  Expected backend URL: {BACKEND_URL}")

        # Kill any existing ngrok tunnels
        try:
            ngrok.kill()
        except:
            pass

        print(f"🔗 Starting ngrok tunnel on port {COLAB_PORT}...")

        # ✅ FIXED: Use different port for ngrok
        public_url = ngrok.connect(COLAB_PORT)

        print(f"✅ Webhook endpoint created: {public_url}")
        print(f"📝 Update your backend .env file with:")
        print(f"   COLAB_WEBHOOK_URL={public_url}/process-documents")

        # ✅ FIXED: Start Flask app on different port
        def run_flask():
            app.run(host='0.0.0.0', port=COLAB_PORT, debug=False, use_reloader=False)

        flask_thread = threading.Thread(target=run_flask)
        flask_thread.daemon = True
        flask_thread.start()

        # Wait a moment for Flask to start
        time.sleep(3)

        print("🎉 Colab integration setup completed!")
        print("💡 Your Colab notebook is now ready to receive processing requests")
        print("🔄 Webhook is running and waiting for requests...")
        print(f"🔗 Webhook URL: {public_url}/process-documents")
        print(f"🔗 Health check: {public_url}/health")

        # Keep the main thread alive and show status
        try:
            while True:
                time.sleep(30)
                print("📡 Webhook still active and listening for requests...")

        except KeyboardInterrupt:
            print("\n🛑 Shutting down...")
            ngrok.kill()

    except Exception as e:
        print(f"❌ Setup error: {str(e)}")
        import traceback
        traceback.print_exc()

def test_backend_callback():
    """
    Test sending a callback to the backend
    """
    try:
        print("🧪 Testing backend callback...")

        test_payload = {
            'processId': 'test_123',
            'status': 'success',
            'questions': [
                {
                    "question_id": 1,
                    "question": "Test question",
                    "predicted_marks": 5,
                    "bloom_level": "L2",
                    "difficulty": "Medium",
                    "matched_topic": "Test Topic",
                    "matched_unit": "Unit 1",
                    "topic_similarity": 0.95
                }
            ],
            'topics': [
                {
                    "unit": "Unit 1",
                    "topic_id": 1,
                    "topic": "Test Topic"
                }
            ]
        }

        callback_url = f"{BACKEND_URL}/api/upload/processing-complete"

        headers = {
            'Content-Type': 'application/json',
            'x-api-key': API_KEY,
            'User-Agent': 'GoogleColab/1.0',
            'ngrok-skip-browser-warning': 'true'
        }

        response = requests.post(callback_url, json=test_payload, headers=headers, timeout=10)

        print(f"🧪 Test callback response: {response.status_code}")
        print(f"🧪 Response text: {response.text}")

        return response.status_code == 200

    except Exception as e:
        print(f"🧪 Test callback failed: {str(e)}")
        return False

# Main execution
if __name__ == "__main__":
    print("🔧 Google Colab Document Processing Setup")
    print("=" * 50)
    print(f"🔗 Backend URL: {BACKEND_URL}")
    print(f"🔧 Colab Port: {COLAB_PORT}")
    print(f"🔑 API Key: {API_KEY[:10]}...")
    print("")

    print("⚠️  IMPORTANT: Update the BACKEND_URL above to your actual backend URL")
    print("⚠️  If your Node.js server is running locally, you need to expose it via ngrok")
    print("")

    # Test backend callback
    print("🧪 Testing backend callback...")
    test_backend_callback()

    print("")
    print("⚠️  SETUP INSTRUCTIONS:")
    print("1. Make sure BACKEND_URL points to your actual backend")
    print("2. Copy the ngrok URL that will be displayed below")
    print("3. Update your backend .env file with the COLAB_WEBHOOK_URL")
    print("")

    # Start the server
    start_colab_server()

🔧 Google Colab Document Processing Setup
🔗 Backend URL: https://068e-49-204-87-250.ngrok-free.app
🔧 Colab Port: 8000
🔑 API Key: 8162079687...

⚠️  IMPORTANT: Update the BACKEND_URL above to your actual backend URL
⚠️  If your Node.js server is running locally, you need to expose it via ngrok

🧪 Testing backend callback...
🧪 Testing backend callback...
🧪 Test callback response: 500
🧪 Response text: {"message":"Failed to update processing status"}

⚠️  SETUP INSTRUCTIONS:
1. Make sure BACKEND_URL points to your actual backend
2. Copy the ngrok URL that will be displayed below
3. Update your backend .env file with the COLAB_WEBHOOK_URL

🚀 Setting up Google Colab integration...
🔧 Using port 8000 for Colab server
🔗 Backend URL: https://068e-49-204-87-250.ngrok-free.app
🧪 Testing backend connection...
✅ Backend is reachable!
🔗 Starting ngrok tunnel on port 8000...
✅ Webhook endpoint created: NgrokTunnel: "https://55f5-34-59-96-172.ngrok-free.app" -> "http://localhost:8000"
📝 Update your back

Address already in use
Port 8000 is in use by another program. Either identify and stop that program, or start the server with a different port.


🎉 Colab integration setup completed!
💡 Your Colab notebook is now ready to receive processing requests
🔄 Webhook is running and waiting for requests...
🔗 Webhook URL: NgrokTunnel: "https://55f5-34-59-96-172.ngrok-free.app" -> "http://localhost:8000"/process-documents
🔗 Health check: NgrokTunnel: "https://55f5-34-59-96-172.ngrok-free.app" -> "http://localhost:8000"/health
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📥 Received processing request!


INFO:werkzeug:127.0.0.1 - - [10/Jun/2025 09:46:32] "POST /process-documents HTTP/1.1" 200 -


📋 Request data keys: ['processId', 'questionBank', 'syllabus', 'callbackUrl']
🔑 Process ID: 6847b19f5abf25eb87e91aab
📄 Question bank size: 11652 chars
📋 Syllabus size: 1335780 chars
🔄 Starting async processing for 6847b19f5abf25eb87e91aab
📁 Temporary files saved
📄 Extracting questions...
🔍 Starting FIXED question extraction...
📄 Page 1: Found 24 questions
📄 Page 2: Found 26 questions
📄 Page 3: Found 19 questions
📄 Page 4: Found 24 questions
📄 Page 5: Found 27 questions
📄 Page 6: Found 25 questions
📄 Page 7: Found 28 questions




📄 Page 8: Found 5 questions
✅ Text extraction found: 178 questions
✅ After deduplication: 178 unique questions
📋 Extracting topics...

🔍 Starting topic extraction for all units...




✅ Using pattern 1: Found 5 units
  Unit I: 7 topics
  Unit II: 8 topics
  Unit III: 11 topics
  Unit IV: 8 topics
  Unit V: 10 topics
✅ Found 178 questions
🤖 Processing with ML models...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
✅ Processing completed: 178 questions, 44 topics
📤 Sending results to backend...
🔗 URL: https://068e-49-204-87-250.ngrok-free.app/api/upload/processing-complete
🔑 Process ID: 6847b19f5abf25eb87e91aab
📊 Questions: 178
📋 Topics: 44
✅ Status: success
📡 Making request to: https://068e-49-204-87-250.ngrok-free.app/api/upload/processing-complete
🔑 Using API key: 8162079687...
📨 Response Status: 200
📨 Response Text: {"message":"Status updated successfully"}
✅ Results sent successfully to backend!
🗑️ Cleaned up temporary files
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active 

Traceback (most recent call last):
  File "<ipython-input-4-2644ac5518a8>", line 136, in process_documents
    data = request.get_json()
           ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/wrappers/request.py", line 608, in get_json
    data = self.get_data(cache=cache)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/wrappers/request.py", line 422, in get_data
    rv = self.stream.read()
         ^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/wsgi.py", line 577, in readall
    data = self.read(1024 * 64)
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/wsgi.py", line 562, in readinto
    self.on_disconnect()
  File "/usr/local/lib/python3.11/dist-packages/werkzeug/wsgi.py", line 499, in on_disconnect
    raise ClientDisconnected()
werkzeug.exceptions.ClientDisconnected: 400 Bad Request: The browser (or proxy) sent a request that this ser

❌ Webhook error: 400 Bad Request: The browser (or proxy) sent a request that this server could not understand.
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for requests...
📡 Webhook still active and listening for 