In [28]:
import os
from google import genai
from typing import Dict, Any
from pydantic import BaseModel

api_key = os.getenv("GEMINI_API_KEY", "AIzaSyD4lR1WQ1yaZumSFtMVTG_0Y8d0oRy1XhA")
client = genai.Client(api_key=api_key)

class GoogleSearchIntentResult(BaseModel):
    requires_search: bool



def detect_google_search_intent(query: str) -> bool:
    """
    Determines if the user's query requires internet access to answer properly.
    
    Args:
        query (str): The user's query
        
    Returns:
        bool: True if the query requires internet access, False otherwise
    """
   
    prompt = f"""You are an expert at determining when a query requires up-to-date information from the internet.
    
    Your task is to:
    1. Analyze the following user query
    2. Determine if an LLM without internet access can provide a satisfactory answer
    3. If the query likely needs internet access (e.g., current events, specific data, recent information), return: {{"requires_search": true}}
    4. If the query can be answered without internet access (e.g., general knowledge, coding help), return: {{"requires_search": false}}
    
    Examples requiring internet:
    - Current news or events
    - Recent statistics or data
    - Real-time information (weather, stocks)
    - Specific factual lookups that aren't common knowledge
    - Information that changes frequently
    
    Return ONLY the JSON object without any additional text.
    
    User query: {query}
    """
     
    try:
        
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=prompt,
            config={
                'response_mime_type': 'application/json',
                'response_schema': GoogleSearchIntentResult,
            },
        )
        
        result = response.parsed
        return result.requires_search
    except Exception as e:
        print(f"Error detecting search intent: {e}")
        return False

In [8]:
import json
from google import genai
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Tuple, TypedDict, Union

class PaperCheckResult(BaseModel):
    Name: str = Field("", description="Paper taker's name or anything that hels identify the paper taker")
    marks: int
    remarks: List[str]
    suggestions: List[str]
    errors: List[str]

class ProcessResult(TypedDict):
    success: bool
    error: str | None
    results: List[Dict[str, Any]] | None

def prepare_document(file_path: str) -> Dict[str, Any]:
    """
    Prepares the document and gets initial response
    Returns: Dictionary with raw response
    """
    try:
        # Initialize the Google AI client
        client = genai.Client(api_key="AIzaSyD4lR1WQ1yaZumSFtMVTG_0Y8d0oRy1XhA")
        
        # Upload the file
        uploaded_file = client.files.upload(file=file_path)
        
        # First prompt for general analysis
        initial_prompt = """
        Analyze this academic paper and provide feedback. Include:
        1. Overall quality score (0-100)
        2. Positive aspects of the paper
        3. Areas that need improvement
        4. Any errors or problems found
        """
        
        # Get initial response
        initial_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[uploaded_file, initial_prompt]
        )
        
        return {
            "success": True, 
            "uploaded_file": uploaded_file,
            "initial_response": initial_response.text
        }
        
    except Exception as e:
        return {"success": False, "error": f"Error preparing document: {str(e)}"}

def analyze_document(initial_result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Takes initial response and converts it to structured format
    Returns: Dictionary with structured results
    """
    try:
        if not initial_result["success"]:
            return initial_result
            
        client = genai.Client(api_key="AIzaSyD4lR1WQ1yaZumSFtMVTG_0Y8d0oRy1XhA")
        
        structure_prompt = f"""
        Convert the following feedback into a structured JSON format:

        {initial_result['initial_response']}

        The JSON should have this structure:
        {{  "Name": "Roll No or name of the paper taker if found, otherwise empty string",
            "marks": integer (0-100) it should depend on how good remarks are and how many errors there are,
            "remarks": [list of positive comments],
            "suggestions": [list of improvement areas],
            "errors": [list of problems found]
        }}

        IMPORTANT: Ensure marks is a valid integer between 0 and 100. If no specific score is found, use 0.
        Ensure all arrays are empty lists [] instead of null when there are no items.
        Ensure Name is an empty string "" if no name is found.
        """
        
        # Get structured response
        structured_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=structure_prompt,
            config={
                'response_mime_type': 'application/json'
            }
        )

        # Parse the response safely
        try:
            # Clean the response text to ensure it's valid JSON
            response_text = structured_response.text.strip()
            if response_text.startswith("```json"):
                response_text = response_text[7:]
            if response_text.endswith("```"):
                response_text = response_text[:-3]
            response_text = response_text.strip()
            
            data = json.loads(response_text)
            
            # Ensure data structure is correct and all fields are valid
            if isinstance(data, dict):
                data["marks"] = int(data.get("marks", 0))  # Convert to int, default to 0
                data["Name"] = str(data.get("Name", ""))  # Convert to string, default to empty string
                data["remarks"] = list(data.get("remarks", []))
                data["suggestions"] = list(data.get("suggestions", []))
                data["errors"] = list(data.get("errors", []))
            
        except json.JSONDecodeError as e:
            return {"success": False, "error": f"Failed to parse AI response: {str(e)}"}
        except (ValueError, TypeError) as e:
            return {"success": False, "error": f"Invalid value conversion: {str(e)}"}
        
        if not isinstance(data, list):
            data = [data]
            
        results = [PaperCheckResult(**item) for item in data]
        final_results = {"success": True, "results": [r.model_dump() for r in results]}
        return final_results
        
    except Exception as e:
        return {"success": False, "error": str(e)}

def process_document(file_path: str) -> ProcessResult:
    """
    Main function that coordinates the document processing
    """
    try:
        # First get raw analysis
        initial_result = prepare_document(file_path)
        if not initial_result["success"]:
            return {"success": False, "error": initial_result["error"], "results": None}
            
        # Then convert to structured format
        result = analyze_document(initial_result)
        if not result["success"]:
            return {"success": False, "error": result["error"], "results": None}
            
        return {"success": True, "error": None, "results": result["results"]}
        
    except Exception as e:
        return {"success": False, "error": str(e), "results": None}

if __name__ == "__main__":
    file_path = 'F:/Aniruddha/code/webdev/PROJECTS/teacherassistant/ai5.pdf'
    result = process_document(file_path)
    
    if result["success"]:
        for paper_result in result["results"]:
            print(f"Name: {paper_result['Name']}")
            print(f"Marks: {paper_result['marks']}")
            print(f"Remarks: {paper_result['remarks']}")
            print(f"Suggestions: {paper_result['suggestions']}")
            print(f"Errors: {paper_result['errors']}")
            print("-" * 50)
    else:
        print("Error:", result["error"])


Name: 
Marks: 78
Remarks: ['Clear and Concise Introduction', 'Well-Structured Theory Section', 'Step-by-Step Algorithm Implementation', 'Includes Advantages and Disadvantages', 'Applications Section', 'Code Implementation', 'Output Displayed', 'Conclusion']
Suggestions: ['Depth of Theoretical Explanation', 'Discussion of Parameters', 'Visualizations', 'More Detailed Analysis of Output', 'Comparisons with Other Algorithms', 'Code Comments', 'Introduction']
Errors: ['Minor Formatting', 'Grammar/Typos: "sollution" should be "solution."']
--------------------------------------------------


In [1]:
import tempfile
from datetime import datetime
from typing import List, Tuple, Optional
import os
import sys

import streamlit as st
import bs4
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from google import genai

def prepare_document(file_path: str) -> List[Document]:
    """
    Processes any document type using Gemini API and returns it in a format
    compatible with the vector storage system.
    
    Args:
        file_path (str): Path to the document file
        
    Returns:
        List[Document]: List containing the processed document
    """
    try:
        # Handle API key retrieval for both Streamlit and FastAPI environments
        if 'st' in globals() and hasattr(st, 'session_state'):
            # Streamlit environment
            api_key = st.session_state.get("google_api_key", os.getenv("GEMINI_API_KEY", ""))
        else:
            # FastAPI environment - get from environment only
            api_key = os.getenv("GEMINI_API_KEY", "")
            
        client = genai.Client(api_key=api_key)
        
        # Determine appropriate prompt based on file type
        file_extension = os.path.splitext(file_path)[1].lower()
        
        # Upload the file directly without opening it
        # The files.upload method will handle opening the file
        uploaded_file = client.files.upload(file=file_path)
        
        if file_extension in ['.png', '.jpg', '.jpeg', '.gif', '.webp']:
            prompt = """
            Please analyze and describe this image in detail. Include:
            1. Type of content and main subject
            2. Key information or features
            3. Visual elements and their significance
            4. Any text present in the image
            5. Overall meaning and context
            """
            source_type = "image"
        else:
            prompt = """
            Please analyze and summarize this document in detail. Include:
            1. Type of content and main subject
            2. Key information or facts
            3. Structure and organization
            4. Main arguments or points
            5. Overall context and significance
            """
            source_type = "document"
        
        # Generate content
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[uploaded_file, prompt],
        )
        
        content = response.text
        print(f"Generated content length: {len(content)}")
        
        # Create a Document object
        doc = Document(
            page_content=content,
            metadata={
                "source_type": source_type,
                "file_name": os.path.basename(file_path),
                "timestamp": datetime.now().isoformat()
            }
        )
        
        # Apply text splitting
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        chunks = text_splitter.split_documents([doc])
        print(f"Number of document chunks: {len(chunks)}")
        
        return chunks
        
    except Exception as e:
        if 'st' in globals() and hasattr(st, 'session_state'):
            # Streamlit environment
            st.error(f"📄 Document processing error: {str(e)}")
        else:
            # FastAPI environment - print to console
            print(f"Document processing error: {str(e)}")
        # Re-raise the exception with a clearer message
        raise ValueError(f"No text content could be extracted from the file: {str(e)}")

# Adding direct testing capability when run as main
# ...existing code...

# Adding direct testing capability when run as main
if __name__ == "__main__":
    # ======= MANUALLY SET YOUR FILE PATH HERE =======
    manual_file_path = "F:/Aniruddha/code/webdev/PROJECTS/teacherassistant/your_document.pdf"
    # ===============================================
    
    # Use manually specified path by default
    test_file_path = manual_file_path
    
    # Command-line argument support (optional, can be removed if not needed)
    if len(sys.argv) > 1:
        test_file_path = sys.argv[1]
    
    # Check if file exists
    if not os.path.exists(test_file_path):
        print(f"Error: File not found at {test_file_path}")
        sys.exit(1)
        
    print(f"Processing document: {test_file_path}")
    
    try:
        # Process the document
        chunks = prepare_document(test_file_path)
        
        # Display results
        print(f"\nSuccessfully processed document into {len(chunks)} chunks")
        print("\nFirst chunk content preview:")
        if chunks:
            print(f"{chunks[0].page_content[:200]}...")
            print("\nMetadata:")
            for key, value in chunks[0].metadata.items():
                print(f"  {key}: {value}")
        
    except Exception as e:
        print(f"\nError processing document: {str(e)}")

USER_AGENT environment variable not set, consider setting it to identify your requests.


Error: File not found at --f="c:\Users\Aniruddha Chaudhari\AppData\Roaming\jupyter\runtime\kernel-v361c19affca1141d34862a6288978fe1575b341fb.json"


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
