In [41]:
# -*- coding: utf-8 -*-
"""
MOSDAC-IntelliBot: Hackathon Project - Step 2 (Unified Corpus Creation) - Typo Fix

This script consolidates all collected data (Markdown, JSON FAQs, JSON Tables, JSON Links, PDF text)
into a single, consistent JSON array ('unified_corpus.json').
"""

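# Dependencies and environment setup: the install and imports below are commented out because they are expected to have been run in an earlier notebook cell. Uncomment them to run this cell on its own.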
# !pip install PyPDF2 nest_asyncio -q
# import nest_asyncio
# import json
# import os
# import re
# from typing import List, Dict, Union
# import PyPDF2

# --- Configuration: root folder holding everything the collection step wrote ---
INPUT_BASE_DIR = "/content/drive/MyDrive/extracted_content"
OUTPUT_UNIFIED_FILE = os.path.join(INPUT_BASE_DIR, "unified_corpus.json")

# One sub-folder per collected data type, all located under INPUT_BASE_DIR.
UNSTRUCTURED_MD_DIR = os.path.join(INPUT_BASE_DIR, "unstructured_markdown")
STRUCTURED_JSON_DIR = os.path.join(INPUT_BASE_DIR, "structured_json")
EXTRACTED_TABLES_DIR = os.path.join(INPUT_BASE_DIR, "extracted_tables")
EXTRACTED_LINKS_DIR = os.path.join(INPUT_BASE_DIR, "extracted_links")
PDFS_DIR = os.path.join(INPUT_BASE_DIR, "pdfs")
DEBUG_HTML_DIR = os.path.join(INPUT_BASE_DIR, "debug_html")

# Make sure every folder exists; exist_ok=True leaves existing folders untouched.
for _data_dir in (
    UNSTRUCTURED_MD_DIR,
    STRUCTURED_JSON_DIR,
    EXTRACTED_TABLES_DIR,
    EXTRACTED_LINKS_DIR,
    PDFS_DIR,
    DEBUG_HTML_DIR,
):
    os.makedirs(_data_dir, exist_ok=True)
del _data_dir  # keep the loop variable out of the notebook namespace


print("Dependencies installed.")
print(f"Unified corpus will be saved to: {OUTPUT_UNIFIED_FILE}")

# Helper function for PDF text extraction (requires PyPDF2 to be imported in an earlier cell)
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read a PDF with PyPDF2 and return the text of all its pages.

    Each page that yields text contributes that text followed by a newline;
    the combined result is stripped of leading/trailing whitespace.

    Failures never propagate: a PyPDF2 read error prints a warning, any other
    exception prints an error, and whatever text was gathered before the
    failure (possibly an empty string) is returned.
    """
    page_chunks = []
    try:
        with open(pdf_path, 'rb') as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                extracted = page.extract_text()
                # Pages without extractable text are simply left out.
                if extracted:
                    page_chunks.append(extracted + "\n")
    except PyPDF2.errors.PdfReadError:
        print(f"  ⚠️ Warning: Could not read PDF file (corrupted/encrypted?): {pdf_path}")
    except Exception as e:
        print(f"  ❌ Error extracting text from PDF {pdf_path}: {e}")
    return "".join(page_chunks).strip()

# Notebook progress marker: printed once execution has passed the helper definition above.
print("PDF extraction helper function defined.")

# Main Processing Logic for Unified Corpus Creation
def _source_url_from_filename(filename: str, pattern: str, fallback_url: str) -> str:
    """
    Rebuild a best-guess source URL from a collected file's name.

    Group 1 of `pattern` is taken from `filename` and every underscore in it
    is turned into a slash (presumably undoing how the collection step
    flattened URLs into file names -- note this is lossy for URLs that
    contained real underscores). If the result already mentions
    'mosdac.gov.in' it is only prefixed with 'https://'; otherwise it is
    treated as a path on https://www.mosdac.gov.in/.

    Returns `fallback_url` when `filename` does not match `pattern`.
    """
    match = re.match(pattern, filename)
    if not match:
        return fallback_url
    url_part = match.group(1).replace('_', '/')
    if 'mosdac.gov.in' in url_part:
        return 'https://' + url_part
    return 'https://www.mosdac.gov.in/' + url_part.lstrip('/')


def _matching_files(directory: str, label: str, suffix: str, ignore_case: bool = False) -> list:
    """
    Return sorted (filename, filepath) pairs for files in `directory` whose
    name ends with `suffix`.

    Sorting makes the processing order, and therefore the generated document
    ids, reproducible between runs (os.listdir order is arbitrary). If the
    directory does not exist a warning naming `label` is printed and an empty
    list is returned.
    """
    if not os.path.exists(directory):
        print(f"  ⚠️ Warning: {label} directory '{directory}' does not exist. Skipping.")
        return []
    pairs = []
    for filename in sorted(os.listdir(directory)):
        candidate = filename.lower() if ignore_case else filename
        if candidate.endswith(suffix):
            pairs.append((filename, os.path.join(directory, filename)))
    return pairs


def _load_json_records(filepath: str) -> list:
    """
    Load a JSON file and return the dict entries of its top-level array.

    A file whose top level is not an array is reported and yields no records;
    array entries that are not objects are skipped, so one malformed entry no
    longer aborts the rest of the file. I/O and JSON decoding errors propagate
    to the caller.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, list):
        print(f"  ⚠️ Warning: Expected a JSON array in {filepath}, got {type(data).__name__}. Skipping.")
        return []
    return [entry for entry in data if isinstance(entry, dict)]


def _as_text(value) -> str:
    """Return `value` as a string, mapping None (JSON null / missing key) to ''."""
    return "" if value is None else str(value)


async def create_unified_corpus():
    """
    Merge all collected data into one JSON array written to OUTPUT_UNIFIED_FILE.

    Sources, processed in this order:
      1. Markdown pages (*.md) in UNSTRUCTURED_MD_DIR  -> one document per file
      2. FAQ JSON files in STRUCTURED_JSON_DIR         -> one document per Q/A pair
      3. Table JSON files in EXTRACTED_TABLES_DIR      -> one document per row
      4. Link JSON files in EXTRACTED_LINKS_DIR        -> one document per link
      5. PDF files in PDFS_DIR                         -> one document per file

    Every document is a dict with the keys "id", "source_url",
    "content_type_detail", "original_file_path" and "text_content". Ids are
    "<kind>_doc_<n>" where <n> is a single counter running across all kinds.

    Problems with an individual file are printed and that file is skipped;
    nothing is raised and nothing is returned. The function is declared
    `async` to keep its existing call interface, although it awaits nothing.
    """
    print("\n--- Starting Unified Corpus Creation ---")
    print(f"Reading data from: {INPUT_BASE_DIR}")

    unified_documents = []

    def add_document(kind, source_url, content_type_detail, filepath, text_content):
        # The running id is simply the 1-based position in the corpus.
        unified_documents.append({
            "id": f"{kind}_doc_{len(unified_documents) + 1}",
            "source_url": source_url,
            "content_type_detail": content_type_detail,
            "original_file_path": filepath,
            "text_content": text_content
        })

    # --- 1. Process Unstructured Markdown Files (.md) ---
    print(f"\nProcessing Unstructured Markdown files from: {UNSTRUCTURED_MD_DIR}")
    for filename, filepath in _matching_files(UNSTRUCTURED_MD_DIR, "Markdown", ".md"):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read().strip()
        except Exception as e:  # deliberate best effort: one bad file must not stop the run
            print(f"  ❌ Error processing markdown file {filepath}: {e}")
            continue
        if not content:
            continue
        add_document(
            "md",
            _source_url_from_filename(
                filename,
                r'(?:page_|xml_page_|dynamic_tab_|internal_page_)?(.+)\.md',
                "https://www.mosdac.gov.in/unknown_md_source",
            ),
            "web_page_markdown",
            filepath,
            content,
        )
    print(f"Processed {len(unified_documents)} markdown documents.")

    # --- 2. Process Structured JSON (FAQs) ---
    print(f"\nProcessing Structured JSON (FAQs) from: {STRUCTURED_JSON_DIR}")
    count_before = len(unified_documents)
    for filename, filepath in _matching_files(STRUCTURED_JSON_DIR, "FAQ", ".json"):
        # The URL depends only on the file name, so derive it once per file.
        source_url = _source_url_from_filename(
            filename, r'faq_(.+)\.json', "https://www.mosdac.gov.in/unknown_faq_source"
        )
        try:
            for faq_item in _load_json_records(filepath):
                question = _as_text(faq_item.get("question"))
                answer = _as_text(faq_item.get("answer"))
                # Only complete question/answer pairs are useful.
                if question.strip() and answer.strip():
                    add_document(
                        "faq", source_url, "faq_item", filepath,
                        f"Question: {question}\nAnswer: {answer}",
                    )
        except Exception as e:  # deliberate best effort per file
            print(f"  ❌ Error processing FAQ JSON file {filepath}: {e}")
    print(f"Processed {len(unified_documents) - count_before} FAQ documents.")

    # --- 3. Process Extracted Tables JSON ---
    print(f"\nProcessing Extracted Tables JSON from: {EXTRACTED_TABLES_DIR}")
    count_before = len(unified_documents)
    for filename, filepath in _matching_files(EXTRACTED_TABLES_DIR, "Tables", ".json"):
        source_url = _source_url_from_filename(
            filename,
            r'(?:table_|angular_table_)(.+)\.json',
            "https://www.mosdac.gov.in/unknown_table_source",
        )
        try:
            for table_entry in _load_json_records(filepath):
                # One "column: value" line per cell; falsy keys/values are dropped.
                readable_text_content = "\n".join(
                    f"{k}: {v}" for k, v in table_entry.items() if k and v
                )
                if readable_text_content.strip():
                    add_document(
                        "table", source_url, "table_row_json", filepath,
                        readable_text_content,
                    )
        except Exception as e:  # deliberate best effort per file
            print(f"  ❌ Error processing Table JSON file {filepath}: {e}")
    print(f"Processed {len(unified_documents) - count_before} table documents.")

    # --- 4. Process Extracted Links JSON ---
    print(f"\nProcessing Extracted Links JSON from: {EXTRACTED_LINKS_DIR}")
    count_before = len(unified_documents)
    for filename, filepath in _matching_files(EXTRACTED_LINKS_DIR, "Links", ".json"):
        try:
            for link_entry in _load_json_records(filepath):
                link_text = _as_text(link_entry.get("link_text"))
                target_url = _as_text(link_entry.get("target_url"))
                context_snippet = _as_text(link_entry.get("context_snippet"))
                source_page = link_entry.get("source_page", "")

                # The formatted text below always contains its labels, so an
                # emptiness check on it can never fail. Judge emptiness on the
                # raw fields instead, so blank link entries are really skipped.
                if not (link_text.strip() or target_url.strip() or context_snippet.strip()):
                    continue

                text_content = f"Link Text: {link_text}\nTarget URL: {target_url}\nContext: {context_snippet}"
                add_document(
                    "link",
                    source_page if source_page else "https://www.mosdac.gov.in/unknown_link_source",
                    "extracted_link_context",
                    filepath,
                    text_content.strip(),
                )
        except Exception as e:  # deliberate best effort per file
            print(f"  ❌ Error processing Links JSON file {filepath}: {e}")
    print(f"Processed {len(unified_documents) - count_before} link documents.")

    # --- 5. Process PDF Files ---
    print(f"\nProcessing PDF files from: {PDFS_DIR}")
    count_before = len(unified_documents)
    for filename, filepath in _matching_files(PDFS_DIR, "PDF", ".pdf", ignore_case=True):
        text_content = extract_text_from_pdf(filepath).strip()
        if not text_content:
            continue
        add_document(
            "pdf",
            _source_url_from_filename(
                filename,
                # Extension matched case-insensitively, in line with the file
                # filter above, so "*.PDF" files also get a reconstructed URL.
                r'doc_(.+)\.[Pp][Dd][Ff]',
                "https://www.mosdac.gov.in/unknown_pdf_source",
            ),
            "pdf_text",
            filepath,
            text_content,
        )
    print(f"Processed {len(unified_documents) - count_before} PDF documents.")

    # --- Final Save ---
    print("\n--- Saving Unified Corpus ---")
    try:
        with open(OUTPUT_UNIFIED_FILE, "w", encoding="utf-8") as f:
            json.dump(unified_documents, f, indent=2, ensure_ascii=False)
        print(f"✅ Successfully created unified corpus with {len(unified_documents)} documents at {OUTPUT_UNIFIED_FILE}")
    except Exception as e:
        print(f"  ❌ Error saving unified corpus: {e}")

# Run the function
if __name__ == "__main__":
    import asyncio

    import nest_asyncio

    # Jupyter/Colab already run an event loop; nest_asyncio patches asyncio so
    # that asyncio.run() may be called from inside that running loop. The same
    # code also works when the file is executed as a plain script.
    nest_asyncio.apply()

    asyncio.run(create_unified_corpus())


Dependencies installed.
Unified corpus will be saved to: /content/drive/MyDrive/extracted_content/unified_corpus.json
PDF extraction helper function defined.

--- Starting Unified Corpus Creation ---
Reading data from: /content/drive/MyDrive/extracted_content

Processing Unstructured Markdown files from: /content/drive/MyDrive/extracted_content/unstructured_markdown
Processed 647 markdown documents.

Processing Structured JSON (FAQs) from: /content/drive/MyDrive/extracted_content/structured_json
Processed 34 FAQ documents.

Processing Extracted Tables JSON from: /content/drive/MyDrive/extracted_content/extracted_tables
Processed 301 table documents.

Processing Extracted Links JSON from: /content/drive/MyDrive/extracted_content/extracted_links
Processed 66856 link documents.

Processing PDF files from: /content/drive/MyDrive/extracted_content/pdfs
Processed 0 PDF documents.

--- Saving Unified Corpus ---
✅ Successfully created unified corpus with 67838 documents at /content/drive/MyDri