In [None]:
from google import genai
from pydantic import BaseModel
from typing import List, Dict, Any, Tuple


def prepare_document(file_path: str) -> Dict[str, Any]:
    """
    Prepares the document and gets initial response
    Returns: Dictionary with raw response
    """
    try:
        # Initialize the Google AI client
        client = genai.Client(api_key="AIzaSyD4lR1WQ1yaZumSFtMVTG_0Y8d0oRy1XhA")
        
        # Upload the file
        uploaded_file = client.files.upload(file=file_path)
        
        # First prompt for general analysis
        initial_prompt = """
        describe the image in detail. Include the following information in your response:
      Primary Subject and Setting: Identify the main subject(s) of the image and describe the setting or environment in which they are located.

Attributes and Characteristics: Detail the physical attributes, expressions, attire, and other distinguishing features of the primary subject(s).

Activities and Interactions: Explain any actions, events, or interactions taking place among subjects or between subjects and their environment.

Artistic Elements: Describe the artistic style, lighting, color palette, perspective, and any other visual elements that contribute to the overall aesthetic of the image.
        """
        
        # Get initial response
        initial_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[uploaded_file, initial_prompt]
        )
        
        return {
            "success": True, 
            "uploaded_file": uploaded_file,
            "initial_response": initial_response.text
        }
        
    except Exception as e:
        return {"success": False, "error": f"Error preparing document: {str(e)}"}
    
response = prepare_document("F:/Aniruddha/code/webdev/PROJECTS/teacherassistant/backend/images.png")

In [35]:
print(response)

{'success': True, 'uploaded_file': File(name='files/7kn2mgnxhu10', display_name=None, mime_type='image/png', size_bytes=6988, create_time=datetime.datetime(2025, 3, 3, 19, 13, 22, 376205, tzinfo=TzInfo(UTC)), expiration_time=datetime.datetime(2025, 3, 5, 19, 13, 22, 361286, tzinfo=TzInfo(UTC)), update_time=datetime.datetime(2025, 3, 3, 19, 13, 22, 376205, tzinfo=TzInfo(UTC)), sha256_hash='MWU2OWYyMjM3ZWU4OWEyZWY2ZGMyZjVhYWVhZGFiMWFkOTg2NjY3MjQxNGZjMzBiOTViYWMzOGZiMjUzNzdkOA==', uri='https://generativelanguage.googleapis.com/v1beta/files/7kn2mgnxhu10', download_uri=None, state=<FileState.ACTIVE: 'ACTIVE'>, source=<FileSource.UPLOADED: 'UPLOADED'>, video_metadata=None, error=None), 'initial_response': 'Here\'s a structured analysis of the cartoon lion image:\n\n**1. Overall Description of the Image:**\n\nThe image depicts a cartoon illustration of a lion standing in a slightly angled, three-quarter view. It\'s a simple, cheerful representation suitable for children. The style is clean an

In [2]:
from google import genai
from pydantic import BaseModel
from typing import List, Dict, Any, Tuple

class PaperCheckResult(BaseModel):
    marks: int
    remarks: List[str]
    suggestions: List[str]
    errors: List[str]

def prepare_document(file_path: str) -> Dict[str, Any]:
    """
    Prepares the document and gets initial response
    Returns: Dictionary with raw response
    """
    try:
        # Initialize the Google AI client
        client = genai.Client(api_key="AIzaSyD4lR1WQ1yaZumSFtMVTG_0Y8d0oRy1XhA")
        
        # Upload the file
        uploaded_file = client.files.upload(file=file_path)
        
        # First prompt for general analysis
        initial_prompt = """
        Analyze this academic paper and provide feedback. Include:
        1. Overall quality score (0-100)
        2. Positive aspects of the paper
        3. Areas that need improvement
        4. Any errors or problems found
        """
        
        # Get initial response
        initial_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[uploaded_file, initial_prompt]
        )
        
        return {
            "success": True, 
            "uploaded_file": uploaded_file,
            "initial_response": initial_response.text
        }
        
    except Exception as e:
        return {"success": False, "error": f"Error preparing document: {str(e)}"}

def analyze_document(initial_result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Takes initial response and converts it to structured format
    Returns: Dictionary with structured results
    """
    try:
        if not initial_result["success"]:
            return initial_result
            
        client = genai.Client(api_key="AIzaSyD4lR1WQ1yaZumSFtMVTG_0Y8d0oRy1XhA")
        
        # Second prompt to structure the response
        structure_prompt = f"""
        Convert the following feedback into a structured JSON format:

        {initial_result['initial_response']}

        The JSON should have this structure:
        {{
            "marks": integer (0-100),
            "remarks": [list of positive comments],
            "suggestions": [list of improvement areas],
            "errors": [list of problems found]
        }}
        """
        
        # Get structured response
        structured_response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=structure_prompt,
            config={
                'response_mime_type': 'application/json'
            }
        )
        
        # Parse the response
        data = structured_response.text
        if isinstance(data, str):
            data = eval(data)
        
        if not isinstance(data, list):
            data = [data]
            
        results = [PaperCheckResult(**item) for item in data]
        return {"success": True, "results": [r.model_dump() for r in results]}
        
    except Exception as e:
        return {"success": False, "error": str(e)}

def process_document(file_path: str) -> Dict[str, Any]:
    """
    Main function that coordinates the document processing
    """
    try:
        # First get raw analysis
        initial_result = prepare_document(file_path)
        if not initial_result["success"]:
            return initial_result
            
        # Then convert to structured format
        return analyze_document(initial_result)
        
    except Exception as e:
        return {"success": False, "error": str(e)}

if __name__ == "__main__":
    file_path = 'spcc3.pdf'
    result = process_document(file_path)
    
    if result["success"]:
        print("Analysis Results:")
        for paper_result in result["results"]:
            print(f"Marks: {paper_result['marks']}")
            print(f"Remarks: {paper_result['remarks']}")
            print(f"Suggestions: {paper_result['suggestions']}")
            print(f"Errors: {paper_result['errors']}")
            print("-" * 50)
    else:
        print("Error:", result["error"])

Analysis Results:
Marks: 65
Remarks: ['The paper has a reasonably clear structure, with defined sections for theory, code, and output.', 'The provided C code appears to implement a shift-reduce parser, which is the stated goal.', 'Including the example output is good practice as it helps demonstrate how the parser works with a specific input.', 'The "Theory" section provides a decent overview of shift-reduce parsing concepts.']
Suggestions: ['While the theory section introduces the concepts, it could benefit from more in-depth explanations and examples.', 'The C code lacks comments, making it harder to understand the logic behind each step, especially for readers unfamiliar with the algorithm. Add inline comments.', "The code doesn't appear to have robust error handling. It could be enhanced to detect and report syntax errors.", "The program takes grammar rules as input. It's critical to specify the format and restrictions of the accepted grammar. Is it context-free? What are the limit

In [31]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datetime import datetime

def load_web_document(url):
    """
    Load a document from a specified URL using WebBaseLoader.

    Args:
    - url (str): The URL of the webpage to load.

    Returns:
    - docs: The loaded document(s).
    """
    loader = WebBaseLoader(url)
    docs = loader.load()
    return docs

def extract_title_and_split_content(docs):
    """
    Extract title and split content into chunks using RecursiveCharacterTextSplitter.

    Args:
    - docs: The documents loaded from WebBaseLoader.

    Returns:
    - tuple: (title, content_chunks)
    """
    if not docs:
        return None, []
    
    # Get title from the first document's metadata
    title = docs[0].metadata.get('title')
    
    # Add source metadata
    for doc in docs:
        doc.metadata.update({
            "source_type": "url",
            "timestamp": datetime.now().isoformat()
        })
    
    # Apply text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    content_chunks = text_splitter.split_documents(docs)
    
    return title, content_chunks

# Example usage
url = "https://timesofindia.indiatimes.com/city/mumbai/pocso-case-20-years-ri-for-father-after-fearless-deposition-by-thane-girl/articleshow/118688947.cms"
docs = load_web_document(url)
title, content_chunks = extract_title_and_split_content(docs)

print(f"Title: {title}")
print(f"\nNumber of content chunks: {len(content_chunks)}")
print("\nSample chunks:")
for i, chunk in enumerate(content_chunks[:3]):  # Print first 3 chunks as sample
    print(f"\nChunk {i+1}:")
    print(chunk.page_content[:150] + "...")  # Print first 150 chars of each chunk

Title: Pocso case: 20 years’ RI for father after fearless deposition by Thane girl | Mumbai News - The Times of India

Number of content chunks: 14

Sample chunks:

Chunk 1:
Pocso case: 20 years’ RI for father after fearless deposition by Thane girl | Mumbai News - The Times of IndiaEditionININUSSign InTOICitymumbaimumbai ...

Chunk 2:
20 years’ RI for father after fearless deposition by Thane girlTrendingUttarakhand AvalancheHimani Narwal MurderAyodhya Ram Mandir FootwearAmroha Cat ...

Chunk 3:
rigorous imprisonment (RI).Judge DS Deshmukh imposed a fine of Rs 20,000 and directed that it be disbursed to the victim as compensation and furthermo...


In [None]:
from datetime import datetime
import bs4
import re
from typing import List, Optional
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def process_web(url: str, chunk_size: int = 1000, chunk_overlap: int = 200, 
                specific_classes: Optional[List[str]] = None) -> list:
    """
    Process web URL and extract content with metadata.
    
    Args:
        url: The URL to process
        chunk_size: Size of text chunks
        chunk_overlap: Overlap between chunks
        specific_classes: Optional list of CSS classes to target
        
    Returns:
        List of document chunks with metadata
    """
    try:
        # Determine if we should use targeted parsing or general parsing
        bs_kwargs = {}
        if specific_classes:
            bs_kwargs = {
                "parse_only": bs4.SoupStrainer(
                    class_=tuple(specific_classes)
                )
            }
        
        # Load and parse webpage content
        loader = WebBaseLoader(
            web_paths=(url,),
            bs_kwargs=bs_kwargs
        )
        documents = loader.load()
        
        print(f"📄 Raw document count: {len(documents)}")
        if documents:
            print(f"📝 Sample content: {documents[0].page_content[:200]}...")
        else:
            print("⚠️ No content extracted. Trying again with general parsing...")
            # Fallback to general parsing
            loader = WebBaseLoader(web_paths=(url,))
            documents = loader.load()
            if documents:
                print(f"📄 Raw document count after fallback: {len(documents)}")
                print(f"📝 Sample content: {documents[0].page_content[:200]}...")
        
        # Add source metadata
        for doc in documents:
            doc.metadata.update({
                "source_type": "web_page",
                "url": url,
                "timestamp": datetime.now().isoformat(),
                "title": extract_title_from_url(url)
            })
            
        # Split documents into manageable chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunks = text_splitter.split_documents(documents)
        return chunks
    
    except Exception as e:
        print(f"🌐 Web processing error: {str(e)}")
        return []

def extract_title_from_url(url: str) -> str:
    """Extract a readable title from a URL"""
    # Remove protocol and domain
    path = re.sub(r'^https?://[^/]+/', '', url)
    # Remove query parameters and fragments
    path = re.sub(r'[?#].*$', '', path)
    # Replace underscores and hyphens with spaces
    title = re.sub(r'[-_]', ' ', path)
    # Split by slashes and take the last meaningful part
    parts = [p for p in title.split('/') if p]
    return parts[-1].title() if parts else "Web Page"

# Test the function with a Wikipedia page
common_content_classes = ["content", "main", "article", "post", "entry", "page-content", "body-content"]
web_chunks = process_web("https://en.wikipedia.org/wiki/Elden_Ring", 
                         specific_classes=common_content_classes)
print(f"Extracted {len(web_chunks)} chunks from the webpage")
if web_chunks:
    print(f"First chunk sample: {web_chunks[0].page_content[:100]}...")
    print(f"Metadata: {web_chunks[0].metadata}")


📄 Raw document count: 1
📝 Sample content: ...
Extracted 0 chunks from the webpage


In [16]:
def google_search(query: str) -> tuple:
    """
    Perform a Google search using Gemini's built-in search capability.
    Returns a tuple containing (text_response, search_links)
    """
    try:
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=query,
            config=types.GenerateContentConfig(
                temperature=0.2,
                tools=[types.Tool(
                    google_search=types.GoogleSearchRetrieval()
                )]
            )
        )
        
        # Extract links from citations and grounding metadata
        links = []
        if hasattr(response, 'candidates') and response.candidates:
            for candidate in response.candidates:
                # Extract links from citation metadata
                if hasattr(candidate, 'citation_metadata') and candidate.citation_metadata:
                    for citation in candidate.citation_metadata.citations:
                        if hasattr(citation, 'url') and citation.url:
                            links.append(citation.url)
                # Extract links from grounding metadata
                if hasattr(candidate, 'grounding_metadata') and candidate.grounding_metadata:
                    for chunk in candidate.grounding_metadata.grounding_chunks:
                        if hasattr(chunk, 'web') and chunk.web:
                            links.append(chunk.web.uri)
        
        return response.text, links
    except Exception as e:
        print(f"🔍 Google search error: {str(e)}")
        return "", []

# Get both text and links
search_text, search_links = google_search("What Supreme court said to samay raina about his case today")

print("Search Results:")
print(search_text)
print("\nSource Links:")
for i, link in enumerate(search_links, 1):
    print(f"{i}. {link}")


Search Results:
Today, March 3, 2025, the Supreme Court reprimanded comedian Samay Raina for commenting on the ongoing case involving his show 'India's Got Latent' while in Canada. Justice Surya Kant stated, "These youngsters may think that we are outdated, but we know how to deal with them. Don't take the court lightly." The court felt that Raina was being "oversmart" and didn't understand the Court's jurisdiction.


Source Links:
1. https://vertexaisearch.cloud.google.com/grounding-api-redirect/AQXblrx025Ewkb5yZl56U3FxsImwts2FXfzn9WN1XmQvhjr8zj6sY3t9TRT5phE57dgCkFWhpe-IOIBI4ctWdV3cHGFxTC6mMVC-v6LsPOVT_BxWGP7rjtLF4MM_UpkQgoNzFBdptBc-1j5OBjeFy502Vep4QTuRoCoNtldBVynDlnqwvZXxtH4vdn_SVCYH4pYl05XsK9GLTzA-QW-NRVLcgJigxYhC8T3-8ALEp6WlJ-8bupg5ukRHqI1KpgyUwiBcujSkc-sXG5MLUYHb7IkJyal7
2. https://vertexaisearch.cloud.google.com/grounding-api-redirect/AQXblrwpmE-N4-pSYbmG7mvz5Zeayy32NLnSF-JUmm80p4usjPu2DaaAz0TaVwN9zKELmlFuTcMiSxgGv5znPkwjL8UFmUYE5oMr4N7A3rNbHzqjvSYIMUk6ZsupI-yGWCWciQMJ5ZpYUr9yOA2

In [28]:
import os
from google import genai
from typing import Dict, Any
from pydantic import BaseModel

api_key = os.getenv("GEMINI_API_KEY", "AIzaSyD4lR1WQ1yaZumSFtMVTG_0Y8d0oRy1XhA")
client = genai.Client(api_key=api_key)

class GoogleSearchIntentResult(BaseModel):
    requires_search: bool



def detect_google_search_intent(query: str) -> bool:
    """
    Determines if the user's query requires internet access to answer properly.
    
    Args:
        query (str): The user's query
        
    Returns:
        bool: True if the query requires internet access, False otherwise
    """
   
    prompt = f"""You are an expert at determining when a query requires up-to-date information from the internet.
    
    Your task is to:
    1. Analyze the following user query
    2. Determine if an LLM without internet access can provide a satisfactory answer
    3. If the query likely needs internet access (e.g., current events, specific data, recent information), return: {{"requires_search": true}}
    4. If the query can be answered without internet access (e.g., general knowledge, coding help), return: {{"requires_search": false}}
    
    Examples requiring internet:
    - Current news or events
    - Recent statistics or data
    - Real-time information (weather, stocks)
    - Specific factual lookups that aren't common knowledge
    - Information that changes frequently
    
    Return ONLY the JSON object without any additional text.
    
    User query: {query}
    """
     
    try:
        
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=prompt,
            config={
                'response_mime_type': 'application/json',
                'response_schema': GoogleSearchIntentResult,
            },
        )
        
        result = response.parsed
        return result.requires_search
    except Exception as e:
        print(f"Error detecting search intent: {e}")
        return False

In [29]:
result = detect_google_search_intent("What happened today in india?")
print(f"Requires search: {result}")

Requires search: True
