In [2]:
import os
import json
import shutil
import tempfile
from typing import List

from dotenv import load_dotenv
import gradio as gr
from pydantic import BaseModel, Field

import pytesseract
from PIL import Image
import cv2
import PyPDF2
from docx import Document as DocxDocument

from langchain_google_genai import GoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain_core.output_parsers import PydanticOutputParser, StrOutputParser
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
import numpy as np

In [3]:
# Load variables from a local .env file into the process environment; the
# Gemini client defined later reads GOOGLE_API_KEY through os.getenv.
load_dotenv()

True

In [4]:
# Structured result for a single document chunk. chunk_parser is built from
# this model, and process_document reads all three fields of each parsed result.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring would presumably be copied into the JSON schema used for the
# prompt's format instructions -- confirm before adding one.
class ChunkSummary(BaseModel):
    summary: str = Field(description="Brief summary of the chunk")
    key_topics: List[str] = Field(description="Key topics in this chunk")
    # Plain string: the description suggests high/medium/low, but the type does
    # not restrict the value to those three words.
    importance: str = Field(description="Importance level: high/medium/low")

# Document-level descriptive fields; nested inside FinalMetadata as `document_info`.
class DocumentInfo(BaseModel):
    title: str = Field(description="Inferred document title")
    document_type: str = Field(description="Type of document")
    # process_document does not report this model-produced value; its JSON output
    # uses the page count it computed itself during text extraction.
    estimated_pages: int = Field(description="total number of pages")
    language: str = Field(description="Primary language")
    subject_area: str = Field(description="Main subject domain")

# Summary-style description of the document's content; nested inside
# FinalMetadata as `content_analysis` and copied field by field into the JSON
# output of process_document.
class ContentAnalysis(BaseModel):
    summary: str = Field(description="Comprehensive document summary")
    key_topics: List[str] = Field(description="Main topics covered")
    main_entities: List[str] = Field(description="Key entities mentioned")
    themes: List[str] = Field(description="Main themes")

# Tag lists intended for classification and discovery; nested inside
# FinalMetadata as `semantic_tags`. All three fields are lists of strings.
class SemanticTags(BaseModel):
    categories: List[str] = Field(description="Document categories")
    keywords: List[str] = Field(description="Important keywords")
    classification: List[str] = Field(description="Classification tags")

# Top-level schema for the generated metadata. metadata_parser is built from this
# model, so the LLM response must contain all three nested sections to parse.
class FinalMetadata(BaseModel):
    document_info: DocumentInfo
    content_analysis: ContentAnalysis
    semantic_tags: SemanticTags

In [5]:
# Gemini LLM shared by chunk_chain and metadata_chain. The API key is taken from
# the environment, which load_dotenv() populated earlier in the notebook.
llm = GoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.1
)

# Embedding model handed to the Chroma vector store in process_document.
# HuggingFaceEmbeddings is imported in the notebook's import cell rather than
# mid-cell, so every dependency is visible in one place.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Output parsers: the two Pydantic parsers validate LLM responses against the
# schemas defined above and also supply the format instructions for the prompts.
chunk_parser = PydanticOutputParser(pydantic_object=ChunkSummary)
metadata_parser = PydanticOutputParser(pydantic_object=FinalMetadata)
# Plain-string parser; not referenced by the chains visible in this notebook.
str_parser = StrOutputParser()

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [6]:
# Chunking parameters, kept as named constants so they are easy to tune.
CHUNK_SIZE = 1000      # maximum characters per chunk
CHUNK_OVERLAP = 200    # characters shared between neighbouring chunks
# Separators listed from coarsest (paragraph break) to finest (single space).
CHUNK_SEPARATORS = ["\n\n", "\n", ". ", "! ", "? ", " "]

# Splitter used by process_document to break extracted text into chunks.
text_splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [7]:
# Prompt for summarising one chunk. `chunk_text` is supplied per call; the
# format instructions are fixed up front from the ChunkSummary parser.
chunk_format_instructions = chunk_parser.get_format_instructions()

chunk_summary_prompt = PromptTemplate(
    input_variables=["chunk_text"],
    partial_variables={"format_instructions": chunk_format_instructions},
    template="""
    Analyze this document chunk and create a brief summary:

    Chunk: {chunk_text}

    {format_instructions}
    """,
)

# Prompt for the final metadata pass. `relevant_chunks` is supplied per call; the
# format instructions are fixed up front from the FinalMetadata parser.
metadata_format_instructions = metadata_parser.get_format_instructions()

final_metadata_prompt = PromptTemplate(
    input_variables=["relevant_chunks"],
    partial_variables={"format_instructions": metadata_format_instructions},
    template="""
    Based on the most relevant document chunks, generate comprehensive metadata for document classification and discoverability:

    Relevant Chunks:
    {relevant_chunks}

    {format_instructions}
    """,
)

In [8]:
# Each chain pipes a prompt into the Gemini LLM and then into a Pydantic parser,
# so invoking a chain yields a validated model instance rather than raw text.
# chunk_chain -> ChunkSummary (invoked once per retrieved chunk)
chunk_chain = chunk_summary_prompt | llm | chunk_parser
# metadata_chain -> FinalMetadata (invoked once per document)
metadata_chain = final_metadata_prompt | llm | metadata_parser

In [9]:
def _extract_text_and_pages(file_path):
    """Extract raw text and a page count from a supported document.

    Args:
        file_path: Path of the file on disk; the extension selects the extractor.

    Returns:
        A ``(text, pages)`` tuple, or ``None`` when the extension is unsupported.
        ``pages`` is exact for PDFs, 1 for images, and an estimate of roughly
        2500 characters per page for DOCX and TXT files.

    Raises:
        ValueError: If an image file cannot be decoded.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        with open(file_path, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # extract_text() can return None/empty for pages without a text
            # layer; treat those as empty instead of failing on concatenation.
            page_texts = [(page.extract_text() or "") for page in pdf_reader.pages]
        return "".join(page_text + "\n" for page_text in page_texts), len(page_texts)

    if ext == ".docx":
        doc = DocxDocument(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
        return text, max(1, round(len(text) / 2500))

    if ext == ".txt":
        # errors="replace" keeps a few undecodable bytes from aborting the
        # whole document when the file is not valid UTF-8.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            text = f.read().strip()
        return text, max(1, round(len(text) / 2500))

    if ext in (".png", ".jpg", ".jpeg", ".bmp", ".tiff"):
        image = cv2.imread(file_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise ValueError("could not read image file (corrupt or unsupported format)")
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return pytesseract.image_to_string(gray), 1  # an image counts as one page

    return None


def process_document(uploaded_file):
    """Generate JSON metadata for an uploaded document.

    Pipeline: extract text -> split into chunks -> index the chunks in a
    temporary Chroma store -> retrieve the most relevant chunks -> summarise
    each with the LLM -> generate the final metadata from those summaries.

    Args:
        uploaded_file: Either a filesystem path (str / os.PathLike) or an
            object exposing the path as ``.name``; ``None`` when nothing was
            uploaded.

    Returns:
        A JSON string with ``document_info``, ``content_analysis``,
        ``semantic_tags`` and ``processing_info`` on success, otherwise a
        human-readable message. Exceptions are reported in the returned string
        rather than raised, so the UI always has text to display.
    """
    if uploaded_file is None:
        return "Please upload a file"

    persist_dir = None
    try:
        # Step 1: Resolve the path and extract text from the uploaded file.
        # Gradio may pass a plain path string or a file wrapper with `.name`.
        if isinstance(uploaded_file, (str, os.PathLike)):
            file_path = os.fspath(uploaded_file)
        else:
            file_path = uploaded_file.name

        extracted = _extract_text_and_pages(file_path)
        if extracted is None:
            return "Unsupported file type. Please upload PDF, DOCX, TXT, or image files."
        text, actual_pages = extracted

        if len(text.strip()) < 100:
            return "Document appears to be empty or text extraction failed. Please try a different file."

        # Step 2: Chunk the text
        chunks = text_splitter.split_text(text)
        documents = [
            Document(page_content=chunk, metadata={"chunk_id": i})
            for i, chunk in enumerate(chunks)
        ]

        # Step 3: Index the chunks in a throw-away Chroma store. A unique
        # directory per call prevents chunks left behind by an earlier (failed
        # or concurrent) run from being retrieved for this document. No
        # explicit persist() is needed: the store is queried in-process and
        # deleted afterwards.
        persist_dir = tempfile.mkdtemp(prefix="chroma_db_temp_")
        vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=embedding_model,
            persist_directory=persist_dir
        )

        # Step 4: Retrieve the most relevant chunks for metadata generation
        overview_query = f"main content summary key topics overview {text[:500]}"
        relevant_docs = vectorstore.similarity_search(overview_query, k=5)

        # Step 5: Summarise each retrieved chunk (validated by ChunkSummary).
        # A failing chunk is logged and skipped so one bad response does not
        # abort the whole document.
        chunk_summaries = []
        for doc in relevant_docs:
            try:
                chunk_result = chunk_chain.invoke({"chunk_text": doc.page_content[:800]})
                chunk_summaries.append(f"Summary: {chunk_result.summary}, Topics: {', '.join(chunk_result.key_topics)}, Importance: {chunk_result.importance}")
            except Exception as e:
                print(f"Chunk processing error: {e}")

        if not chunk_summaries:
            # Without any summary the final prompt would be empty and the model
            # would invent metadata, so report the failure instead.
            return "Error processing document: no chunk summaries could be generated."

        # Step 6: Create the final metadata (validated by FinalMetadata)
        combined_chunks = "\n\n".join(
            f"Chunk {i+1}: {summary}" for i, summary in enumerate(chunk_summaries)
        )
        metadata_result = metadata_chain.invoke({"relevant_chunks": combined_chunks})

        # Step 7: Structure the final output
        final_output = {
            "document_info": {
                "title": metadata_result.document_info.title,
                "document_type": metadata_result.document_info.document_type,
                # Use the page count measured during extraction, not the LLM's guess.
                "estimated_pages": actual_pages,
                "language": metadata_result.document_info.language,
                "subject_area": metadata_result.document_info.subject_area
            },
            "content_analysis": {
                "summary": metadata_result.content_analysis.summary,
                "key_topics": metadata_result.content_analysis.key_topics,
                "main_entities": metadata_result.content_analysis.main_entities,
                "themes": metadata_result.content_analysis.themes
            },
            "semantic_tags": {
                "categories": metadata_result.semantic_tags.categories,
                "keywords": metadata_result.semantic_tags.keywords,
                "classification": metadata_result.semantic_tags.classification
            },
            "processing_info": {
                "total_chunks": len(chunks),
                "processed_chunks": len(chunk_summaries),
                "file_size_kb": round(os.path.getsize(file_path)/1024, 1)
            }
        }

        return json.dumps(final_output, indent=2)

    except Exception as e:
        return f"Error processing document: {str(e)}"
    finally:
        # Best-effort removal of the temporary vector store on every exit path
        # (success, early return or error); files still locked are left behind.
        if persist_dir is not None:
            shutil.rmtree(persist_dir, ignore_errors=True)

In [10]:
# Gradio interface
# Gradio interface
# Input component: a single-file upload limited to the extensions that
# process_document knows how to read.
document_upload = gr.File(
    file_count="single",
    file_types=[".pdf", ".docx", ".txt", ".png", ".jpg", ".jpeg", ".bmp", ".tiff"],
    label="Upload Document",
)

# Output component: the returned JSON string (or message) shown as copyable text.
metadata_textbox = gr.Textbox(
    show_copy_button=True,
    max_lines=30,
    lines=20,
    label="Generated Metadata (JSON)",
)

# Markdown shown under the title.
# NOTE(review): the text mentions an 8MB limit, but no size check is visible in
# this notebook -- confirm where (or whether) it is enforced.
APP_DESCRIPTION = """
    Upload any document (PDF, DOCX, TXT, or Image) up to 8MB to automatically generate comprehensive metadata.
    
    **Features:**
    - ✅ Multi-format support (PDF, DOCX, TXT, Images)
    - ✅ OCR for image-based documents
    - ✅ Semantic content identification
    - ✅ Local embedding generation
    - ✅ Structured metadata output
    - ✅ Document classification and tagging
    
    **Supported file types:** PDF, DOCX, TXT, PNG, JPG, JPEG, BMP, TIFF
    """

# Wire the upload box to process_document and display its result.
interface = gr.Interface(
    fn=process_document,
    inputs=[document_upload],
    outputs=[metadata_textbox],
    title="🤖 Automated Metadata Generation System",
    description=APP_DESCRIPTION,
    examples=[],
    cache_examples=False,
    theme=gr.themes.Soft(),
    allow_flagging="never",
)



In [11]:
# Shut down Gradio servers left over from earlier runs of this notebook,
# presumably so the port used by the launch cell is free again.
# gradio is already imported as `gr` in the import cell, so it is not re-imported here.
gr.close_all()  # ⛔ Close any existing Gradio interfaces

In [12]:
# Start the web UI when this code runs as the main module (true inside a notebook kernel).
if __name__ == "__main__":
    launch_options = dict(
        # NOTE(review): 0.0.0.0 listens on every network interface, so the upload
        # form is reachable from other machines -- confirm this is intended.
        server_name="0.0.0.0",
        server_port=7861,
        share=False,      # no public share link
        show_error=True,
        quiet=False,
    )
    interface.launch(**launch_options)

* Running on local URL:  http://0.0.0.0:7861
* To create a public link, set `share=True` in `launch()`.


  vectorstore.persist()


In [13]:
# Stop the Gradio server started by the launch cell.
gr.close_all()

Closing server running on port: 7861


In [14]:
# Same shutdown call as the previous cell, run again.
gr.close_all()

Closing server running on port: 7861
