In [1]:
! pip install pypdf

Collecting pypdf
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.1.1-py3-none-any.whl (323 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/323.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m317.4/323.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.1.1


In [2]:
import os
import json
import sys
from pypdf import PdfReader # Library for PDF text extraction
from pypdf.errors import PdfReadError # Correct import for pypdf exceptions
from google import genai
from google.genai import types
from google.genai.errors import APIError

# --- Configuration ---

# Gemini model identifier passed as `model=` when requesting the risk analysis.
MODEL_NAME = "gemini-2.5-flash"

from getpass import getpass

# Take the key from the GEMINI_KEY_STRING environment variable when it is set
# and non-empty; otherwise ask for it interactively without echoing it.
GEMINI_KEY_STRING = os.getenv("GEMINI_KEY_STRING") or getpass("Enter your Gemini API key (hidden): ")

# Build the shared API client once; an empty key is forwarded as None.
try:
    client = genai.Client(api_key=GEMINI_KEY_STRING or None)
except Exception as init_error:
    print(f"Error initializing Gemini client: {init_error}")
    print("Please set your Gemini API key in the 'GEMINI_KEY_STRING' variable or as the 'GEMINI_API_KEY' environment variable.")
    sys.exit(1)


# --- 1. PDF Text Extraction Function ---

def extract_text_from_pdf(pdf_path: str, *, max_pages: int | None = 5) -> str:
    """
    Extracts readable text from the leading pages of a PDF file.

    Pages are read in document order and reading stops once ``max_pages``
    pages have been visited. Pages for which no text is returned are skipped.
    Progress, warnings and errors are reported with ``print``; no exception
    is propagated to the caller.

    Args:
        pdf_path: The local path to the PDF file.
        max_pages: Maximum number of leading pages to read. Defaults to 5.
            Pass None to read every page; a value of 0 or less reads no
            pages, which yields an empty result.

    Returns:
        A single string in which each extracted page text is followed by a
        blank line, or an empty string if nothing could be extracted or an
        error occurred.
    """
    print(f"Attempting to extract text from: {pdf_path}...")
    try:
        reader = PdfReader(pdf_path)
        page_texts = []

        for index, page in enumerate(reader.pages):
            # Stop before touching pages beyond the requested limit.
            if max_pages is not None and index >= max_pages:
                break
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)

        # Join once instead of growing a string inside the loop.
        text = "".join(f"{page_text}\n\n" for page_text in page_texts)

        if not text.strip():
            print("WARNING: Extracted text is empty or only whitespace.")
            return ""

        scope = "all pages" if max_pages is None else f"first {max_pages} pages"
        print(f"Extraction successful. Total characters extracted ({scope}): {len(text)}")
        return text
    except FileNotFoundError:
        print(f"ERROR: PDF file not found at path: {pdf_path}")
        return ""
    except PdfReadError:
        print(f"ERROR: Could not read or process PDF file: {pdf_path}. Is it password-protected or corrupt?")
        return ""
    except Exception as e:
        # Deliberate best-effort boundary: any other failure is reported and
        # turned into an empty result so the caller can skip the analysis.
        print(f"An unexpected error occurred during PDF extraction: {e}")
        return ""


# --- 2. LLM Prompt Engineering & Schema Definition ---

# System prompt sent as `system_instruction` with the analysis request in
# analyze_dpr_risks. It assigns the analyst role, lists the four mandatory
# risk categories with the factors to check under each, and asks for a single
# JSON object as the reply. This text is sent to the model verbatim, so any
# edit to it changes what the model is asked to do.
SYSTEM_INSTRUCTION = """
You are a Senior Project Risk Analyst specializing in Detailed Project Report (DPR) analysis.
Your primary task is to assess the provided project summary against a mandated set of risk criteria.
Based on the project description, assign a risk score (1-10, where 10 is highest risk) for each category
and list specific findings or gaps identified.

MANDATORY RISK PREDICTION FACTORS TO CHECK:

1. Cost Overrun Risk:
   - Cost estimates without SOR (Schedule of Rates) basis.
   - Missing contingency provisions (e.g., 10-15% of project cost).
   - Historical data showing similar projects exceeded budget (Infer if project complexity/type suggests risk).
   - EPC (Engineering-Procurement-Construction) mode not adopted for infrastructure projects.

2. Delay Risk:
   - No provision for Liquidated Damages (LD) in the contract.
   - Unrealistic timelines compared to similar projects.
   - Missing statutory clearances (environmental, forest, etc.).
   - Land not available/encumbrance issues.

3. Implementation Risk (Assume project > Rs. 100 Crore if size suggests large infrastructure):
   - Missing third-party monitoring provision (for projects >Rs. 100 crore).
   - Weak or generic monitoring mechanism.
   - No mandatory quarterly reporting structure.
   - Missing KPIs for progress tracking.

4. Sustainability Risk:
   - No O&M mechanism outlined beyond 4 years.
   - Missing long-term sustainability plan.
   - No community benefit framework or stakeholder engagement plan.
   - Environmental concerns are not adequately addressed or mitigating measures are missing.

You MUST return the assessment as a single JSON object conforming to the provided schema.
"""

# Response schema passed as `response_schema` in analyze_dpr_risks. The reply
# is one object holding an overall score string plus an array with one entry
# (name, score string, list of findings) per risk category.
#
# Every field is listed under `required`: schema fields are optional unless
# marked required, and display_results would otherwise fall back to 'N/A' or
# print nothing for a category whenever the model left a field out.
RISK_ASSESSMENT_SCHEMA = types.Schema(
    type=types.Type.OBJECT,
    required=["overallRiskScore", "riskCategories"],
    properties={
        "overallRiskScore": types.Schema(
            type=types.Type.STRING,
            description="Overall risk level (e.g., Low, Medium, High) with a numerical average score (e.g., 7.5/10)."
        ),
        "riskCategories": types.Schema(
            type=types.Type.ARRAY,
            description="Array of detailed risk assessments for each category.",
            items=types.Schema(
                type=types.Type.OBJECT,
                required=["categoryName", "score", "findings"],
                properties={
                    "categoryName": types.Schema(type=types.Type.STRING, description="e.g., Cost Overrun Risk"),
                    "score": types.Schema(type=types.Type.STRING, description="Risk score for this category (e.g., 8/10)"),
                    "findings": types.Schema(
                        type=types.Type.ARRAY,
                        description="Specific findings or gaps identified against the mandatory factors.",
                        items=types.Schema(type=types.Type.STRING)
                    )
                }
            )
        )
    }
)

# --- 3. Core Analysis Function ---

def analyze_dpr_risks(project_summary: str) -> dict | None:
    """
    Analyzes the project summary text using the Gemini API for structured risk assessment.

    Args:
        project_summary: The text extracted from the DPR.

    Returns:
        A dictionary containing the structured risk analysis, or None on failure.
    """
    if not project_summary:
        print("Analysis skipped: No project summary text provided.")
        return None

    user_prompt = f"Analyze the following project summary based on the system instructions:\n\n---\n{project_summary}\n---"

    try:
        response = client.models.generate_content(
            model=MODEL_NAME,
            contents=[user_prompt],
            config=types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                response_mime_type="application/json",
                response_schema=RISK_ASSESSMENT_SCHEMA
            ),
        )


        analysis_json = response.text.strip()
        return json.loads(analysis_json)

    except APIError as e:
        print(f"API Error during content generation: {e}")
        print("Please check your API key, model usage limits, and prompt structure.")
        return None
    except json.JSONDecodeError:
        print("Error: Model response was not valid JSON. This may be due to an input length issue.")
        print(f"Raw response: {response.text}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during API call: {e}")
        return None

# --- 4. Result Formatting ---

# Phrases that mark a finding as a gap. They are checked before the strength
# phrases so that negated wording such as "not adopted" is never labelled a
# strength just because it contains "adopted".
_GAP_PHRASES = (
    "missing",
    "not available",
    "unrealistic",
    "weak",
    "no provision",
    "not adopted",
    "not well-addressed",
)

# Phrases that mark a finding as a strength when no gap phrase matched.
_STRENGTH_PHRASES = ("strength", "well-addressed", "adopted")


def _classify_finding(finding: str) -> str:
    """Return the display label (" HIGH GAP", " STRENGTH" or " NOTE") for one finding."""
    lowered = finding.lower()
    if any(phrase in lowered for phrase in _GAP_PHRASES):
        return " HIGH GAP"
    if any(phrase in lowered for phrase in _STRENGTH_PHRASES):
        return " STRENGTH"
    return " NOTE"


def display_results(analysis_data: dict, file_path: str):
    """
    Prints the structured risk analysis in a readable format.

    Args:
        analysis_data: Analysis dictionary; the keys read are
            'overallRiskScore' and 'riskCategories', and per category
            'categoryName', 'score' and 'findings'. Missing keys fall back to
            placeholder text, and a missing or null list is treated as empty.
        file_path: Path of the analysed DPR file, shown in the report header.

    Returns:
        None. The report is written to standard output.
    """
    print("\n" + "="*80)
    print("                DPR PROJECT RISK ANALYSIS (GENERATED BY AI)               ")
    print("="*80 + "\n")
    print(f"ANALYSIS OF DPR FILE: {file_path}\n")

    print(f"Overall Project Risk: {analysis_data.get('overallRiskScore', 'N/A')}\n")

    # `or []` also covers an explicit null, which .get()'s default does not.
    for category in analysis_data.get('riskCategories') or []:
        name = category.get('categoryName', 'Unknown Category')
        score = category.get('score', 'N/A')
        findings = category.get('findings') or []

        print(f"[{name}] - Risk Score: {score}")
        print("-" * (len(str(name)) + 20))

        if findings:
            for i, finding in enumerate(findings, 1):
                # str() keeps a non-string entry from crashing the report.
                risk_level = _classify_finding(str(finding))
                print(f"  {i}. {risk_level}: {finding}")
        else:
            print("  No significant gaps or findings identified for this category.")

        print("\n")
    print("="*80)


# --- 5. Main Execution Block ---

def analyze_dpr_pdf(pdf_file_path: str) -> dict | None:
    """
    Runs the full pipeline for one DPR file: text extraction, then risk analysis.

    Args:
        pdf_file_path: The local path to the DPR PDF file.

    Returns:
        The analysis produced by analyze_dpr_risks, or None when either the
        extraction or the analysis yields nothing (a message is printed
        naming the stage that failed).
    """
    extracted = extract_text_from_pdf(pdf_file_path)

    if extracted:
        report = analyze_dpr_risks(extracted)
        if report:
            return report
        # Extraction worked but the analysis stage produced nothing usable.
        print("Risk analysis failed or returned no results.")
    else:
        print("Failed to extract text from PDF.")

    return None


Enter your Gemini API key (hidden): ··········
