INSTALLATIONS

In [None]:
pip install PyMuPDF pdfplumber PyPDF2 sentence-transformers rapidfuzz torch pytesseract pdf2image pillow




In [None]:
pip install PyPDF

Collecting PyPDF
  Downloading pypdf-6.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.1.1-py3-none-any.whl (323 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/323.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.5/323.5 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF
Successfully installed PyPDF-6.1.1


COMPLETENESS SCORE (GUIDELINE CHECKER)

In [None]:
import os
import fitz  # PyMuPDF
import torch
from sentence_transformers import SentenceTransformer, util
from rapidfuzz import fuzz
import pytesseract
from pdf2image import convert_from_path

# ======================================
# CONFIG
# ======================================
# Batch size passed to model.encode() when embedding key points and document lines.
BATCH_SIZE = 16
# Minimum cosine similarity for a "long" key point (more than 4 words) to count as found.
SEMANTIC_THRESHOLD = 0.82
# Minimum rapidfuzz partial_ratio for a "short" key point (4 words or fewer) to count as found.
FUZZY_THRESHOLD = 75
# Minimum fraction of a section's key points that must be found for the section to be "present".
SECTION_SCORE_THRESHOLD = 0.78
# Run on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# ======================================
# MANDATORY SECTIONS AND KEY POINTS
# ======================================
# Maps each mandatory DPR section to the key points that must appear in the document.
# Key points with more than 4 words are matched by embedding similarity; shorter ones
# (e.g. "CGST") are matched by fuzzy string comparison.
MANDATORY_SECTIONS = {
    "Project Profile": [
        "Project name",
        "Project description",
        "Location(s) with geo-coordinates (latitude/longitude)",
        "Satellite image or photograph of project site",
        "Timeline for implementation"
    ],
    "Beneficiary & Impact Analysis": [
        "Expected beneficiaries (quantifiable)",
        "Socio-economic impact assessment",
        "Alignment with Sustainable Development Goals (SDG) or Multidimensional Poverty Index (MPI)",
        "Output-Outcome framework with Key Performance Indicators (KPIs)"
    ],
    "Technical Specifications": [
        "Technical details",
        "Technical design",
        "Alignment with GatiShakti Master Plan",
        "Compliance with concerned line department guidelines"
    ],
    "Financial Details": [
        "Cost estimates based on latest Schedule of Rates (SOR)",
        "CGST",
        "SGST",
        "All sources of funding",
        "Operations & Maintenance (O&M) cost for first 4 years included in total project cost"
    ],
    "Sustainability & Management": [
        "Sustainability plan",
        "Mechanism for O&M after project completion (beyond 4 years)",
        "Provision for project evaluation(s)"
    ],
    "Statutory Clearances": [
        "Forest & Environment clearance",
        "Town and Country Planning approval",
        "Industries clearance"
    ],
    "Required Certificates": [
        "Land availability certificate",
        "Cost certification (confirming costs are as per latest SOR)"
    ]
}

# Phrases that indicate a Non-Duplication Certificate (NDC) or an equivalent
# declaration. Each entry is compared against every document line in
# check_non_duplication().
# NOTE(review): "duplicate project" can also match text stating the opposite
# (that the project IS a duplicate) - confirm this entry is intended.
NDC_KEYWORDS = [
    "non-duplication certificate",
    "no duplication certificate",
    "certificate of non duplication",
    "project not duplicated",
    "no similar project",
    "duplicate project",
    "not taken up by other department",
    "not funded by other scheme",
    "unique project",
    "not funded by any other scheme",
    "not sanctioned under any other scheme",
    # The trailing comma below was missing, which fused this phrase with the
    # next one into a single keyword that could never match.
    "project is unique and not repeated elsewhere",
    "not approved under other ministry",
    "funded only under this scheme",
    "exclusive project proposal",
    "no duplication under any ongoing project",
    "not covered under any other government program",
]

# Marks distribution per section.
# Six sections carry a base of 20/6 marks each (20 in total), "Statutory Clearances"
# carries 5 more on top of its base, and "Required Certificates" is worth 10, so the
# values add up to 35 (up to floating-point rounding). The non-duplication bonus
# (ndc_marks) is added separately in validate_dpr_with_marks().
MARKS_DISTRIBUTION = {
    "Project Profile": 20/6,
    "Beneficiary & Impact Analysis": 20/6,
    "Technical Specifications": 20/6,
    "Financial Details": 20/6,
    "Sustainability & Management": 20/6,
    "Statutory Clearances": 20/6 +5,
    "Required Certificates": 10
}


Using device: cuda


In [None]:

# ======================================
# LOAD MODEL AND PRECOMPUTE EMBEDDINGS
# ======================================
# Shared sentence encoder used by the completeness checks below.
model = SentenceTransformer('intfloat/e5-large-v2', device=device)

# Pre-compute embeddings once for the "long" key points (more than 4 words) of
# every section; sections that only have short key points get an empty list.
# NOTE(review): the e5 model family is documented as expecting "query: " /
# "passage: " input prefixes - confirm whether omitting them here is intended.
section_embeddings_dict = {}
for section, keypoints in MANDATORY_SECTIONS.items():
    long_keypoints = [kp for kp in keypoints if len(kp.split()) > 4]
    section_embeddings_dict[section] = (
        model.encode(
            long_keypoints,
            batch_size=BATCH_SIZE,
            convert_to_tensor=True,
            device=device,
        )
        if long_keypoints
        else []
    )

# ======================================
# HELPER FUNCTIONS
# ======================================
def _ocr_page(page, dpi=300):
    """Render one PyMuPDF page to an image and return the text Tesseract reads from it.

    Args:
        page: A PyMuPDF page object.
        dpi: Rendering resolution. PyMuPDF renders at 72 dpi by default, which is
            usually too coarse for OCR, so the page is scaled by ``dpi / 72``.

    Returns:
        The OCR text for the page (may be empty).
    """
    # Imported lazily so PDFs that already carry a text layer never need Pillow.
    import io
    from PIL import Image

    zoom = dpi / 72.0
    pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    with Image.open(io.BytesIO(pixmap.tobytes("png"))) as image:
        return pytesseract.image_to_string(image)


def extract_text_from_pdf(pdf_path, ocr_dpi=300):
    """Extract the text of every page of a PDF, falling back to OCR for blank pages.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_dpi: Resolution used when a page has no text layer and must be OCR'd.

    Returns:
        A single string with each page's text followed by a newline.

    Raises:
        Whatever ``fitz.open`` or ``pytesseract`` raise (unreadable file, missing
        Tesseract binary, ...); errors are deliberately not swallowed here.
    """
    page_texts = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_text = page.get_text("text")
            if not page_text.strip():
                # No text layer (e.g. a scanned page): fall back to OCR.
                page_text = _ocr_page(page, dpi=ocr_dpi)
            page_texts.append(page_text)
    # Join once instead of growing a string page by page.
    return "".join(page_text + "\n" for page_text in page_texts)

def validate_dpr(pdf_path):
    """Check which mandatory key points are present in a DPR PDF.

    The PDF text is split into lines (lines of 5 characters or fewer are
    dropped). Key points of at most 4 words are matched with rapidfuzz
    ``partial_ratio`` against every line; longer key points are matched by the
    best cosine similarity between their pre-computed embedding and the line
    embeddings.

    Args:
        pdf_path: Path of the PDF to validate.

    Returns:
        ``(results, missing_points)`` where ``results`` maps each section to
        ``{"present": bool, "score": float}`` and ``missing_points`` maps a
        section to the key points that were not found (sections with nothing
        missing are omitted). Both are empty dicts when no usable text is found.
    """
    raw_text = extract_text_from_pdf(pdf_path)

    sentences = []
    for line in raw_text.split('\n'):
        stripped = line.strip()
        if len(stripped) > 5:
            sentences.append(stripped)

    if not sentences:
        print(f"{pdf_path} is empty!")
        return {}, {}

    sentence_embeddings = model.encode(
        sentences,
        batch_size=BATCH_SIZE,
        convert_to_tensor=True,
        device=device,
    )
    lowered_sentences = [s.lower() for s in sentences]

    results, missing_points = {}, {}

    for section, keypoints in MANDATORY_SECTIONS.items():
        # Route each key point to the fuzzy (short) or semantic (long) matcher.
        short_keypoints, long_keypoints = [], []
        for kp in keypoints:
            bucket = long_keypoints if len(kp.split()) > 4 else short_keypoints
            bucket.append(kp)

        # Short key points: best fuzzy match over all lines must reach the threshold.
        section_missing = [
            kp
            for kp in short_keypoints
            if max(fuzz.partial_ratio(kp.lower(), s) for s in lowered_sentences) < FUZZY_THRESHOLD
        ]

        # Long key points: best cosine similarity over all lines must reach the threshold.
        if long_keypoints:
            for kp, kp_emb in zip(long_keypoints, section_embeddings_dict[section]):
                best_similarity = float(util.cos_sim(kp_emb, sentence_embeddings).max())
                if best_similarity < SEMANTIC_THRESHOLD:
                    section_missing.append(kp)

        score = 1 - len(section_missing)/len(keypoints)
        results[section] = {"present": score >= SECTION_SCORE_THRESHOLD, "score": score}
        if section_missing:
            missing_points[section] = section_missing

    return results, missing_points



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [None]:
# ============================================
# NON-DUPLICATION CHECKER
# ============================================

def check_non_duplication(text, sim_threshold=0.75, fuzzy_threshold=0.85):
    """Look for evidence of a Non-Duplication Certificate (NDC) in the document text.

    Every line longer than 20 characters is compared against every phrase in
    ``NDC_KEYWORDS``. A line counts as evidence when BOTH the embedding cosine
    similarity and the rapidfuzz ``partial_ratio`` (scaled to 0-1) exceed their
    thresholds. Up to three of the strongest matches are printed.

    Args:
        text: Full document text; lines are separated by newlines.
        sim_threshold: Cosine similarity that must be exceeded.
        fuzzy_threshold: Fuzzy ratio (0-1) that must be exceeded.

    Returns:
        True if at least one line matched a keyword, otherwise False.
    """
    print("\n=== NON-DUPLICATION CERTIFICATE CHECK ===")
    sentences = [s.strip() for s in (text or "").split("\n") if len(s.strip()) > 20]

    found = []
    if sentences and NDC_KEYWORDS:
        # Encode keywords and lines once, in batches, instead of re-encoding
        # both for every (line, keyword) pair.
        keyword_embeddings = model.encode(NDC_KEYWORDS, batch_size=BATCH_SIZE,
                                          convert_to_tensor=True)
        sentence_embeddings = model.encode(sentences, batch_size=BATCH_SIZE,
                                           convert_to_tensor=True)
        # Rows are lines, columns are keywords; pull to plain floats once.
        similarity_rows = util.cos_sim(sentence_embeddings, keyword_embeddings).cpu().tolist()
        lowered_keywords = [kw.lower() for kw in NDC_KEYWORDS]

        for sent, similarities in zip(sentences, similarity_rows):
            sent_lower = sent.lower()
            for kw, kw_lower, sim in zip(NDC_KEYWORDS, lowered_keywords, similarities):
                fuzzy_score = fuzz.partial_ratio(kw_lower, sent_lower) / 100

                if sim > sim_threshold and fuzzy_score > fuzzy_threshold:
                    found.append((sent, kw, sim, fuzzy_score))

    if found:
        print(" Possible Non-Duplication Certificate evidence found:")
        for sent, kw, sim, fuzzy in sorted(found, key=lambda x: max(x[2], x[3]), reverse=True)[:3]:
            print(f"  🔹 '{sent[:120]}...'  [match: {kw}]  (sim={sim:.2f}, fuzzy={fuzzy:.2f})")
        return True
    else:
        print(" No clear Non-Duplication evidence found.")
        return False


# ======================================
# MARKS CALCULATION
# ======================================
# Bonus marks awarded when non-duplication evidence is found in the document.
ndc_marks = 5


def compute_marks(results, missing_points):
    """Convert per-section validation output into marks.

    Each section earns ``marks_total * found / total`` where ``found`` is the
    number of its key points that are not listed in ``missing_points``.

    A section that does not appear in ``results`` was never validated (for
    example ``validate_dpr`` returns two empty dicts for a PDF with no text),
    so it earns 0 instead of being treated as "nothing missing".

    Args:
        results: Mapping of section name to validation info, as returned by
            ``validate_dpr``. Only membership is used.
        missing_points: Mapping of section name to the key points not found.

    Returns:
        ``(section_scores, total_score)``; ``section_scores`` maps each section
        to its ``marks_obtained``, ``marks_total``, ``keypoints_found`` and
        ``total_keypoints``.
    """
    results = results or {}
    missing_points = missing_points or {}

    total_score = 0
    section_scores = {}

    for section, keypoints in MANDATORY_SECTIONS.items():
        marks_total = MARKS_DISTRIBUTION.get(section, 0)
        total_keypoints = len(keypoints)

        if section in results and total_keypoints:
            missing_count = len(missing_points.get(section, []))
            found_count = max(total_keypoints - missing_count, 0)
            section_mark = (found_count / total_keypoints) * marks_total
        else:
            # Unvalidated section, or a section with no key points (avoids a
            # division by zero): nothing can be credited.
            found_count = 0
            section_mark = 0.0

        section_scores[section] = {"marks_obtained": section_mark,
                                   "marks_total": marks_total,
                                   "keypoints_found": found_count,
                                   "total_keypoints": total_keypoints}
        total_score += section_mark

    return section_scores, total_score

# ======================================
# VALIDATE DPR AND COMPUTE MARKS
# ======================================
def validate_dpr_with_marks(pdf_path):
    """Validate a DPR PDF, convert the result to marks and add the NDC bonus.

    Args:
        pdf_path: Path of the PDF to score.

    Returns:
        ``(section_scores, total_score, results, missing_points)`` where
        ``total_score`` already includes ``ndc_marks`` when non-duplication
        evidence was found.
    """
    # NOTE: validate_dpr() extracts the text again internally; the copy read
    # here is only used for the non-duplication check.
    text = extract_text_from_pdf(pdf_path)
    results, missing_points = validate_dpr(pdf_path)
    section_scores, total_score = compute_marks(results, missing_points)

    if check_non_duplication(text):
        total_score += ndc_marks

    # The maximum attainable score includes the NDC bonus; leaving it out let
    # the printed score exceed its own denominator.
    max_score = sum(MARKS_DISTRIBUTION.values()) + ndc_marks
    print(f"\nOverall Score: {total_score:.2f} / {max_score:.2f}\n")
    return section_scores, total_score, results, missing_points



TECHNICAL QUALITY SCORE

In [None]:
import re
import PyPDF2
import pdfplumber
import numpy as np
import os

# Optional dependency: spaCy provides named-entity recognition and sentence
# splitting. Any failure (package missing, model not downloaded) switches the
# analyzer to its keyword-based fallbacks.
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    NLP_AVAILABLE = True
except Exception as exc:  # not a bare except: Ctrl-C / SystemExit must still propagate
    NLP_AVAILABLE = False
    nlp = None
    print("⚠ spaCy not available. Install with: pip install spacy && python -m spacy download en_core_web_sm")
    print(f"  Reason: {type(exc).__name__}: {str(exc)[:100]}")

# Optional dependency: a small sentence-embedding model used for the
# GatiShakti semantic-similarity score.
try:
    from sentence_transformers import SentenceTransformer
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    SEMANTIC_AVAILABLE = True
except Exception as exc:
    SEMANTIC_AVAILABLE = False
    semantic_model = None
    print("Sentence-transformers not available. Install with: pip install sentence-transformers")
    print(f"  Reason: {type(exc).__name__}: {str(exc)[:100]}")


class TrueNLPDPRAnalyzer:
    """Score the technical quality of a DPR PDF out of 25.

    Four dimensions are scored: geo-coordinates (0-5), techno-economic detail
    (0-10), GatiShakti alignment (0-5) and timeline (0-5). When the module-level
    flags ``NLP_AVAILABLE`` / ``SEMANTIC_AVAILABLE`` are set, the module-level
    ``nlp`` (spaCy) pipeline and ``semantic_model`` (sentence embeddings) are
    used; otherwise each dimension falls back to keyword counting.

    Attributes:
        text: Raw text of the last PDF that was extracted.
        verbose: When True, progress and errors are printed.
        scores: Score per dimension for the current analysis.
        evidence: Human-readable findings per dimension for the current analysis.
        doc: spaCy document for ``text`` (None when spaCy is unavailable).
    """

    MAX_TOTAL_SCORE = 25
    # Text beyond this many characters is not passed to spaCy.
    MAX_NLP_CHARS = 1000000
    # Only the first N sentences are embedded for the GatiShakti check.
    MAX_SEMANTIC_SENTENCES = 100

    # Geographic coordinates. The previous pattern made every separator optional,
    # so any number followed by a word starting with n/s/e/w (e.g. "2024 shall")
    # counted as a coordinate. A degree sign or an explicit lat/long label is now
    # required, and decimal degrees are recognised.
    _COORD_RE = re.compile(
        r"""
        (?<![\w.])\d{1,3}(?:\.\d+)?\s*[°º˚]\s*      # degrees; a degree sign is mandatory
        (?:
            \d{1,2}(?:\.\d+)?\s*['′’]\s*            # optional minutes
            (?:\d{1,2}(?:\.\d+)?\s*["″”]\s*)?       # optional seconds
        )?
        [NSEW](?![\w/])                             # hemisphere letter, not the start of a word or unit
        |
        \b(?:latitude|longitude|lat|lon|lng)\b\.?\s*[:=]?\s*[-+]?\d{1,3}\.\d+
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    # Rupee amounts in lower-cased text ("rs. 5 crore", "₹ 12,00,000"). The word
    # boundary and the mandatory digit stop matches inside "years 5" or "others,".
    _RUPEE_AMOUNT_RE = re.compile(
        r'(?:\brs\.?|₹)\s*\d[\d,]*(?:\.\d+)?(?:\s*(?:crores?|lakhs?|cr|lacs?)\b)?'
    )
    # Stand-alone currency words in lower-cased text, used by the fallback scorer.
    _CURRENCY_MENTION_RE = re.compile(r'\brs\b\.?|₹|\bcrores?\b|\blakhs?\b')

    # (score key, scorer method, progress header, max points, error prefix)
    _DIMENSIONS = (
        ('geo_coordinates', 'score_geo_coordinates',
         "\n[2/5] Geo-coordinates (NER)...", 5, " Error in geo scoring: "),
        ('techno_economic', 'score_techno_economic',
         "\n[3/5] Techno-economic (NER + Parsing)...", 10, "  Error in techno-economic scoring: "),
        ('gatishakti', 'score_gatishakti',
         "\n[4/5] GatiShakti (Semantic)...", 5, " Error in GatiShakti scoring: "),
        ('timeline', 'score_timeline',
         "\n[5/5] Timeline (DATE NER)...", 5, " Error in timeline scoring: "),
    )

    def __init__(self, verbose=True):
        """Create an analyzer; ``verbose`` controls progress printing."""
        self.text = ""
        self.verbose = verbose
        self.doc = None  # spaCy document
        self._reset_state()

    def _reset_state(self):
        """Clear scores and evidence so one analysis cannot leak into the next."""
        self.scores = {
            'geo_coordinates': 0,
            'techno_economic': 0,
            'gatishakti': 0,
            'timeline': 0
        }
        self.evidence = {
            'geo_coordinates': [],
            'techno_economic': [],
            'gatishakti': [],
            'timeline': []
        }

    def extract_text_from_pdf(self, pdf_path):
        """Extract text with pdfplumber, falling back to PyPDF2 if nothing was read.

        Stores the text in ``self.text`` and, when spaCy is available, the parsed
        document in ``self.doc``. Extraction errors are reported (when verbose)
        but never raised.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            The extracted text (empty string when both extractors fail).
        """
        text = ""
        # Drop any document left over from a previous PDF on this instance.
        self.doc = None

        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    try:
                        page_text = page.extract_text()
                        if page_text:
                            text += page_text + "\n"
                    except Exception as e:
                        if self.verbose:
                            print(f" Page extraction: {str(e)[:50]}")
                        continue
        except Exception as e:
            if self.verbose:
                print(f" pdfplumber error: {str(e)[:50]}")

        if not text.strip():
            try:
                with open(pdf_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for i, page in enumerate(pdf_reader.pages):
                        try:
                            page_text = page.extract_text()
                            if page_text:
                                text += page_text + "\n"
                        except Exception as e:
                            if self.verbose:
                                print(f" PyPDF2 page {i}: {str(e)[:50]}")
                            continue
            except Exception as e:
                if self.verbose:
                    print(f" PyPDF2 error: {str(e)[:50]}")

        self.text = text

        # Process with spaCy NLP if available
        if NLP_AVAILABLE and text.strip():
            try:
                # Slicing is a no-op for shorter texts.
                self.doc = nlp(text[:self.MAX_NLP_CHARS])
            except Exception as e:
                if self.verbose:
                    print(f" NLP processing: {str(e)[:50]}")
                self.doc = None

        return text

    # ==================== GEO-COORDINATES (TRUE NLP) ====================

    def score_geo_coordinates(self):
        """Score location detail (0-5) from place entities, coordinates and context.

        Uses spaCy GPE/LOC/FAC entities, coordinate matches from ``_COORD_RE``
        and sentences mentioning location vocabulary. Falls back to keyword
        counting when spaCy is unavailable.

        Returns:
            The score, also stored in ``self.scores['geo_coordinates']``.
        """
        score = 0

        if NLP_AVAILABLE and self.doc:
            locations = [ent.text for ent in self.doc.ents if ent.label_ in ['GPE', 'LOC', 'FAC']]
            coords = self._COORD_RE.findall(self.text)

            location_words = ['location', 'site', 'coordinate', 'latitude', 'longitude', 'chainage', 'alignment']
            location_sentences = []
            for sent in self.doc.sents:
                sent_text_lower = sent.text.lower()
                if any(word in sent_text_lower for word in location_words):
                    location_sentences.append(sent.text)

            unique_locations = len(set(locations))
            coord_count = len(coords)

            if coord_count >= 2 or unique_locations >= 5:
                score = 5
                self.evidence['geo_coordinates'].append(
                    f"NLP: Found {unique_locations} locations, {coord_count} coordinates"
                )
            elif coord_count >= 1 or unique_locations >= 3:
                score = 4
                self.evidence['geo_coordinates'].append(
                    f"NLP: {unique_locations} locations identified"
                )
            elif unique_locations >= 2 or len(location_sentences) >= 3:
                score = 3
                self.evidence['geo_coordinates'].append(
                    f"NLP: {unique_locations} locations, {len(location_sentences)} location contexts"
                )
            elif unique_locations >= 1 or len(location_sentences) >= 1:
                score = 2
                self.evidence['geo_coordinates'].append("NLP: Basic location information detected")
            else:
                score = 0
                self.evidence['geo_coordinates'].append("NLP: No location entities found")

            if locations:
                # Sorted so the reported sample is the same on every run.
                unique_locs_list = sorted(set(locations))[:5]
                self.evidence['geo_coordinates'].append(f"Locations detected: {', '.join(unique_locs_list)}")

        else:
            # Fallback to pattern matching if NLP not available
            score = self._fallback_geo_score()

        self.scores['geo_coordinates'] = score
        return score

    def _fallback_geo_score(self):
        """Keyword-only location score (0-4) used when spaCy is unavailable."""
        text = self.text.lower()
        location_terms = ['chainage', 'km ', 'station', 'alignment', 'latitude', 'longitude', 'district']
        location_count = sum(1 for term in location_terms if term in text)

        if location_count >= 5:
            score = 4
        elif location_count >= 3:
            score = 3
        elif location_count >= 1:
            score = 2
        else:
            score = 0

        self.evidence['geo_coordinates'].append(f"Fallback: {location_count} location indicators")
        return score

    # ==================== TECHNO-ECONOMIC ====================

    def score_techno_economic(self):
        """Score financial and technical detail (0-10).

        Points: financial figures (0-4), cost-related sentences (0-3),
        technical sentences (0-2), distinct organisations (0-1).

        Returns:
            The score, also stored in ``self.scores['techno_economic']``.
        """
        score = 0

        if NLP_AVAILABLE and self.doc:
            money_entities = [ent.text for ent in self.doc.ents if ent.label_ == 'MONEY']
            orgs = [ent.text for ent in self.doc.ents if ent.label_ == 'ORG']

            cost_lemmas = ['cost', 'estimate', 'budget', 'expenditure', 'amount']
            cost_sentences = [
                sent.text for sent in self.doc.sents
                if any(token.lemma_ in cost_lemmas for token in sent)
            ]

            tech_words = ['specification', 'design', 'technical', 'engineering', 'construction']
            tech_sentences = []
            for sent in self.doc.sents:
                sent_lower = sent.text.lower()
                if any(word in sent_lower for word in tech_words):
                    tech_sentences.append(sent.text)

            indian_currency = self._RUPEE_AMOUNT_RE.findall(self.text.lower())

            # Combine NLP entities with pattern matching
            total_financial = len(money_entities) + len(indian_currency)

            feature_score = 0
            # Financial figures (0-4)
            if total_financial >= 15:
                feature_score += 4
                self.evidence['techno_economic'].append(
                    f"NLP: Excellent financial data - {len(money_entities)} money entities, {len(indian_currency)} Indian currency"
                )
            elif total_financial >= 8:
                feature_score += 3
                self.evidence['techno_economic'].append(f"NLP: Good financial data - {total_financial} financial figures")
            elif total_financial >= 3:
                feature_score += 2
                self.evidence['techno_economic'].append(f"NLP: Adequate financial data - {total_financial} figures")
            elif total_financial >= 1:
                feature_score += 1

            # Cost analysis (0-3)
            if len(cost_sentences) >= 5:
                feature_score += 3
                self.evidence['techno_economic'].append(f"NLP: Detailed cost analysis - {len(cost_sentences)} cost-related sentences")
            elif len(cost_sentences) >= 3:
                feature_score += 2
                self.evidence['techno_economic'].append(f"NLP: Good cost analysis - {len(cost_sentences)} sentences")
            elif len(cost_sentences) >= 1:
                feature_score += 1

            # Technical content (0-2)
            if len(tech_sentences) >= 5:
                feature_score += 2
                self.evidence['techno_economic'].append(f"NLP: Strong technical content - {len(tech_sentences)} technical sentences")
            elif len(tech_sentences) >= 2:
                feature_score += 1

            # Organizations mentioned (0-1)
            if len(set(orgs)) >= 3:
                feature_score += 1
                self.evidence['techno_economic'].append(f"NLP: Multiple organizations identified: {len(set(orgs))}")

            score = min(feature_score, 10)

        else:
            # Fallback
            score = self._fallback_techno_score()

        self.scores['techno_economic'] = score
        return score

    def _fallback_techno_score(self):
        """Keyword-only techno-economic score (0-10) used when spaCy is unavailable.

        Returns 0 when no currency mention is found, otherwise at least 1.
        """
        text = self.text.lower()
        cost_terms = ['cost', 'budget', 'estimate', 'expenditure']
        cost_count = sum(text.count(term) for term in cost_terms)

        # Whole-word matching: a bare "rs" would also hit "years", "others", ...
        indian_currency = len(self._CURRENCY_MENTION_RE.findall(text))

        score = min(cost_count // 3 + indian_currency // 5, 10)
        self.evidence['techno_economic'].append(f"Fallback: {cost_count} cost refs, {indian_currency} currency mentions")
        return max(score, 1) if indian_currency > 0 else 0

    # ==================== GATISHAKTI (WITH SEMANTIC SIMILARITY) ====================

    def score_gatishakti(self):
        """Score alignment with the PM GatiShakti master plan (0-5).

        An explicit mention scores 5 without running the embedding model.
        Otherwise up to ``MAX_SEMANTIC_SENTENCES`` sentences are compared with a
        concept description by cosine similarity. Falls back to keyword
        counting when the semantic model is unavailable or fails.

        Returns:
            The score, also stored in ``self.scores['gatishakti']``.
        """
        score = 0
        text_lower = self.text.lower()

        gatishakti_mentioned = bool(re.search(r'gati\s*shakti', text_lower))

        if SEMANTIC_AVAILABLE:
            if gatishakti_mentioned:
                # Explicit mention already earns full marks; skip the embeddings.
                score = 5
                self.evidence['gatishakti'].append(
                    "Semantic NLP: GatiShakti explicitly mentioned"
                )
            else:
                gatishakti_concept = "PM GatiShakti National Master Plan multimodal infrastructure integration"

                sentences = self.text.split('.')
                sentences = [s.strip() for s in sentences if len(s.strip()) > 20][:self.MAX_SEMANTIC_SENTENCES]

                if sentences:
                    try:
                        concept_embedding = semantic_model.encode([gatishakti_concept])[0]
                        sentence_embeddings = semantic_model.encode(sentences)

                        # The concept norm does not change per sentence.
                        norm_concept = np.linalg.norm(concept_embedding)
                        similarities = []
                        for sent_emb in sentence_embeddings:
                            norm_sent = np.linalg.norm(sent_emb)
                            if norm_concept > 0 and norm_sent > 0:
                                sim = np.dot(concept_embedding, sent_emb) / (norm_concept * norm_sent)
                            else:
                                sim = 0
                            similarities.append(sim)

                        # Find sentences with high similarity
                        high_similarity = [s for s, sim in zip(sentences, similarities) if sim > 0.3]
                        avg_similarity = np.mean(similarities) if similarities else 0

                        # Scoring based on semantic understanding
                        if len(high_similarity) >= 5 or avg_similarity > 0.25:
                            score = 4
                            self.evidence['gatishakti'].append(
                                f"Semantic NLP: Strong alignment - {len(high_similarity)} relevant sentences (similarity: {avg_similarity:.2f})"
                            )
                        elif len(high_similarity) >= 2 or avg_similarity > 0.20:
                            score = 3
                            self.evidence['gatishakti'].append(
                                "Semantic NLP: Good alignment - integration concepts present"
                            )
                        elif len(high_similarity) >= 1 or avg_similarity > 0.15:
                            score = 2
                            self.evidence['gatishakti'].append("Semantic NLP: Moderate alignment with GatiShakti principles")
                        else:
                            score = 1
                            self.evidence['gatishakti'].append("Semantic NLP: Minimal alignment")
                    except Exception as e:
                        # Report instead of silently switching scoring method.
                        if self.verbose:
                            print(f" Semantic scoring failed, using fallback: {str(e)[:50]}")
                        score = self._fallback_gatishakti_score()
                else:
                    score = 0

        else:
            score = self._fallback_gatishakti_score()

        self.scores['gatishakti'] = score
        return score

    def _fallback_gatishakti_score(self):
        """Keyword-only GatiShakti score (0-3) used when the semantic model is unavailable."""
        text = self.text.lower()

        gatishakti = 'gatishakti' in text or 'gati shakti' in text
        integration_terms = ['integration', 'connectivity', 'multimodal', 'clearance']
        integration_count = sum(1 for term in integration_terms if term in text)

        if gatishakti:
            score = 3
            self.evidence['gatishakti'].append("Fallback: GatiShakti mentioned")
        elif integration_count >= 2:
            score = 2
            self.evidence['gatishakti'].append(f"Fallback: {integration_count} integration concepts")
        elif integration_count >= 1:
            score = 1
            self.evidence['gatishakti'].append("Fallback: Basic integration concepts")
        else:
            score = 0
            self.evidence['gatishakti'].append("Fallback: No GatiShakti alignment")

        return score

    # ==================== TIMELINE (TRUE NLP) ====================

    def score_timeline(self):
        """Score timeline detail (0-5, in half-point steps).

        Points: dates/durations (0-2), sentences with schedule vocabulary (0-2),
        sentences containing DATE/TIME entities (0-1).

        Returns:
            The score, also stored in ``self.scores['timeline']``.
        """
        score = 0

        if NLP_AVAILABLE and self.doc:
            date_entities = [ent.text for ent in self.doc.ents if ent.label_ == 'DATE']

            timeline_words = ['schedule', 'timeline', 'duration', 'period', 'phase', 'completion', 'deadline']
            timeline_sentences = []
            for sent in self.doc.sents:
                sent_lower = sent.text.lower()
                if any(word in sent_lower for word in timeline_words):
                    timeline_sentences.append(sent.text)

            temporal_sentences = [
                sent for sent in self.doc.sents
                if any(ent.label_ in ['DATE', 'TIME'] for ent in sent.ents)
            ]

            duration_pattern = r'\d+\s*(?:month|year)s?'
            durations = re.findall(duration_pattern, self.text.lower())

            timeline_score = 0

            # Dates and durations (0-2)
            if len(date_entities) >= 5 or len(durations) >= 5:
                timeline_score += 2.0
                self.evidence['timeline'].append(
                    f"NLP: Excellent temporal data - {len(date_entities)} dates, {len(durations)} durations"
                )
            elif len(date_entities) >= 2 or len(durations) >= 2:
                timeline_score += 1.5
                self.evidence['timeline'].append(f"NLP: Good temporal data - {len(date_entities)} dates")
            elif len(date_entities) >= 1 or len(durations) >= 1:
                timeline_score += 1.0
                self.evidence['timeline'].append("NLP: Basic temporal information")

            # Timeline structure (0-2)
            if len(timeline_sentences) >= 5:
                timeline_score += 2.0
                self.evidence['timeline'].append(f"NLP: Strong timeline structure - {len(timeline_sentences)} relevant sentences")
            elif len(timeline_sentences) >= 2:
                timeline_score += 1.0
                self.evidence['timeline'].append("NLP: Moderate timeline structure")

            # Temporal context (0-1)
            if len(temporal_sentences) >= 5:
                timeline_score += 1.0
                self.evidence['timeline'].append("NLP: Rich temporal context throughout document")
            elif len(temporal_sentences) >= 2:
                timeline_score += 0.5

            # Snap to the nearest half point and cap at the maximum.
            score = min(round(timeline_score * 2) / 2, 5)

            if date_entities:
                self.evidence['timeline'].append(f"Sample dates: {', '.join(date_entities[:3])}")

        else:
            # Fallback
            score = self._fallback_timeline_score()

        self.scores['timeline'] = score
        return score

    def _fallback_timeline_score(self):
        """Keyword-only timeline score (0-5) used when spaCy is unavailable.

        Returns 0 when no timeline term is found, otherwise at least 1.
        """
        text = self.text.lower()

        timeline_terms = ['schedule', 'timeline', 'duration', 'period', 'completion', 'phase']
        timeline_count = sum(1 for term in timeline_terms if term in text)

        durations = len(re.findall(r'\d+\s*(?:month|year)', text))

        score = min((timeline_count // 2) + (durations // 2), 5)
        self.evidence['timeline'].append(f"Fallback: {timeline_count} timeline terms, {durations} durations")
        return max(score, 1) if timeline_count > 0 else 0

    # ==================== MAIN ANALYSIS ====================

    def analyze_dpr(self, pdf_path):
        """Run the full analysis for one PDF.

        Scores and evidence are reset first, so the same instance can be reused
        for several PDFs. A failure in one dimension zeroes that dimension only.

        Args:
            pdf_path: Path of the PDF to analyze.

        Returns:
            On success a dict with ``pdf_path``, ``scores``, ``total_score``,
            ``percentage``, ``grade``, ``evidence``, ``nlp_used`` and
            ``semantic_used``. On failure a dict with ``pdf_path``, ``error``,
            ``total_score`` (0) and zeroed ``scores``.
        """
        try:
            self._reset_state()

            if self.verbose:
                nlp_status = "WITH TRUE NLP" if (NLP_AVAILABLE or SEMANTIC_AVAILABLE) else "FALLBACK MODE"
                print(f"\n Analyzing: {os.path.basename(pdf_path)} {nlp_status}")
                print("="*70)

            if self.verbose:
                print("\n[1/5] Extracting text...")
            text = self.extract_text_from_pdf(pdf_path)

            if not text.strip():
                if self.verbose:
                    print(" No text extracted from PDF")
                return {
                    'pdf_path': pdf_path,
                    'error': 'Failed to extract text from PDF',
                    'total_score': 0,
                    'scores': {k: 0 for k in self.scores.keys()}
                }

            if self.verbose:
                print(f"  {len(text):,} characters extracted")
                if NLP_AVAILABLE:
                    print(" NLP processing with spaCy")
                if SEMANTIC_AVAILABLE:
                    print(" Semantic analysis available")

            # Score each dimension with error handling
            for key, method_name, header, max_points, error_prefix in self._DIMENSIONS:
                try:
                    if self.verbose:
                        print(header)
                    getattr(self, method_name)()
                    if self.verbose:
                        print(f" Score: {self.scores[key]}/{max_points}")
                except Exception as e:
                    if self.verbose:
                        print(f"{error_prefix}{str(e)[:50]}")
                    self.scores[key] = 0

            total_score = sum(self.scores.values())
            percentage = round((total_score / self.MAX_TOTAL_SCORE) * 100, 1)

            if total_score >= 20:
                grade = "Excellent"
            elif total_score >= 17:
                grade = "Very Good"
            elif total_score >= 14:
                grade = "Good"
            elif total_score >= 11:
                grade = "Satisfactory"
            else:
                grade = "Needs Improvement"

            if self.verbose:
                print(f"\n      RESULT: {total_score}/25 ({percentage}%) - {grade}")

            return {
                'pdf_path': pdf_path,
                'scores': self.scores.copy(),
                'total_score': total_score,
                'percentage': percentage,
                'grade': grade,
                # Copy the lists too, so the returned evidence is a snapshot.
                'evidence': {key: list(items) for key, items in self.evidence.items()},
                'nlp_used': NLP_AVAILABLE,
                'semantic_used': SEMANTIC_AVAILABLE
            }

        except Exception as e:
            if self.verbose:
                print(f"  Fatal error: {str(e)[:100]}")
            return {
                'pdf_path': pdf_path,
                'error': f'Analysis failed: {str(e)}',
                'total_score': 0,
                'scores': {k: 0 for k in self.scores.keys()}
            }

    def generate_report(self, results):
        """Format the dict returned by ``analyze_dpr`` as a printable text report.

        Args:
            results: A result dict from ``analyze_dpr``.

        Returns:
            The report text, or a one-line error message when ``results``
            contains an ``error`` key.
        """
        if 'error' in results:
            return f"\n ERROR: {results['error']}\n"

        scores = results['scores']
        parts = [
            "\n" + "="*70,
            "\n       TRUE NLP DPR QUALITY SCORE REPORT",
            "\n" + "="*70 + "\n",
            f"\n File: {os.path.basename(results['pdf_path'])}",
            f"\n NLP: {'✓ spaCy' if results.get('nlp_used') else '✗'} | ",
            f"Semantic: {'✓ Transformers' if results.get('semantic_used') else '✗'}\n",
            "\n" + "-"*70 + "\n",
            f"  Geo-coordinates:    {scores['geo_coordinates']}/5\n",
            f"  Techno-economic:    {scores['techno_economic']}/10\n",
            f"  GatiShakti:         {scores['gatishakti']}/5\n",
            f"  Timeline:           {scores['timeline']}/5\n",
            "-"*70 + "\n",
            f"  TOTAL:              {results['total_score']}/25 ({results['percentage']}%)\n",
            f"  GRADE:              {results['grade']}\n",
            "="*70 + "\n",
            "\n" + "-"*70,
            "\n                    NLP FINDINGS",
            "\n" + "-"*70 + "\n",
        ]

        for component, evidences in results['evidence'].items():
            parts.append(f"\n{component.upper().replace('_', ' ')}:\n")
            for ev in evidences:
                parts.append(f"  • {ev}\n")

        parts.append("\n" + "="*70 + "\n")

        return "".join(parts)


def analyze_multiple_dprs(pdf_paths, verbose=True, export_csv=None):
    """Run the technical-quality analysis over several DPR PDFs.

    NOTE: a later cell of this notebook defines another function with the
    same name (taking ``file_paths`` / ``output_csv``). Once that cell has
    been executed it replaces this definition in the kernel namespace.

    Args:
        pdf_paths: One of
            * a directory path - every ``*.pdf`` file directly inside it is
              analysed, in sorted order so batch runs are reproducible;
            * a single PDF path (``str`` or ``os.PathLike``);
            * an iterable of PDF paths.
        verbose: Print progress banners and per-file scores.
        export_csv: Optional output path. When given, the collected results
            are written with ``export_results_to_csv`` (defined in a later
            cell of this notebook, so that cell must have been run).

    Returns:
        list[dict]: One result dict per PDF. Files whose analysis raised get
        a dict with ``'error'``, ``'total_score': 0`` and zeroed ``'scores'``.
    """
    if isinstance(pdf_paths, (str, os.PathLike)):
        single_path = os.fspath(pdf_paths)
        if os.path.isdir(single_path):
            # os.listdir order is arbitrary; sort for stable DPR numbering.
            pdf_paths = sorted(
                os.path.join(single_path, f)
                for f in os.listdir(single_path)
                if f.lower().endswith('.pdf')
            )
            if verbose:
                print(f"\n Found {len(pdf_paths)} PDF files")
        else:
            # A lone file path must not be iterated character by character.
            pdf_paths = [single_path]
    else:
        # Materialise so len() below also works for generators.
        pdf_paths = list(pdf_paths)

    results_list = []

    if verbose:
        print("\n" + "="*70)
        print(f"  BATCH ANALYSIS: {len(pdf_paths)} DPR(s)")
        print("="*70)

    for i, pdf_path in enumerate(pdf_paths, 1):
        if verbose:
            print(f"\n{'='*70}")
            print(f"  DPR {i}/{len(pdf_paths)}")
            print(f"{'='*70}")

        try:
            # Fresh analyzer per file so no state leaks between documents.
            analyzer = TrueNLPDPRAnalyzer(verbose=verbose)
            results = analyzer.analyze_dpr(pdf_path)
            results_list.append(results)

            if verbose and 'error' not in results:
                print(f"\n  Score: {results['total_score']}/25 ({results['percentage']}%) - {results['grade']}")
        except Exception as e:
            # Best effort: one broken PDF must not abort the whole batch.
            if verbose:
                print(f"\n  Error: {str(e)[:100]}")
            results_list.append({
                'pdf_path': pdf_path,
                'error': str(e),
                'total_score': 0,
                'scores': {'geo_coordinates': 0, 'techno_economic': 0, 'gatishakti': 0, 'timeline': 0}
            })

    if export_csv:
        # Previously this argument was accepted but silently ignored.
        export_results_to_csv(results_list, export_csv)

    return results_list


def calculate_dpr_score(pdf_path, verbose=True):
    """Analyse a single DPR PDF and return its result dict.

    Args:
        pdf_path: Path of the PDF to analyse.
        verbose: When truthy, the analyzer runs verbosely and the formatted
            report is printed as well.

    Returns:
        dict: Whatever ``TrueNLPDPRAnalyzer.analyze_dpr`` returned.
    """
    dpr_analyzer = TrueNLPDPRAnalyzer(verbose=verbose)
    outcome = dpr_analyzer.analyze_dpr(pdf_path)

    if not verbose:
        return outcome

    print(dpr_analyzer.generate_report(outcome))
    return outcome

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
import csv
import os

def export_results_to_csv(results_list, output_file='dpr_analysis.csv'):
    """Write one CSV row per DPR analysis result.

    Args:
        results_list: Iterable of result dicts. Successful results must carry
            ``'total_score'``, ``'percentage'``, ``'grade'`` and ``'scores'``;
            results containing an ``'error'`` key are written as a zeroed
            ``ERROR`` / ``Failed`` row.
        output_file: Destination CSV path (overwritten if it exists).

    Returns:
        None. The file is written as UTF-8.
    """
    header = [
        'Filename', 'Total_Score', 'Percentage', 'Grade',
        'Geo_Score', 'TechnoEconomic_Score', 'GatiShakti_Score', 'Timeline_Score', 'Status'
    ]

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)

        for result in results_list:
            # Apply the placeholder AFTER basename: os.path.basename('N/A')
            # would otherwise treat the slash as a separator and yield 'A'.
            pdf_path = result.get('pdf_path')
            filename = os.path.basename(pdf_path) if pdf_path else 'N/A'

            if 'error' not in result:
                s = result['scores']
                writer.writerow([
                    filename,
                    result['total_score'],
                    result['percentage'],
                    result['grade'],
                    s['geo_coordinates'],
                    s['techno_economic'],
                    s['gatishakti'],
                    s['timeline'],
                    'Success'
                ])
            else:
                writer.writerow([filename, 0, 0, 'ERROR', 0, 0, 0, 0, 'Failed'])


def generate_comparison_report(results_list, output_file='dpr_report.txt'):
    """Write a plain-text comparison report covering a batch of DPR results.

    The file starts with a title banner, then (only if at least one result
    has no ``'error'`` key) summary statistics over the successful results,
    then one section per result with either its scores and evidence or its
    error message.

    Args:
        results_list: List of result dicts from the DPR analyzer.
        output_file: Destination text file (UTF-8, overwritten).
    """
    rule = "=" * 80
    thin_rule = "-" * 80

    with open(output_file, 'w', encoding='utf-8') as report_file:

        def put(text=""):
            # Emit one line; called without arguments it emits a blank line.
            report_file.write(text + "\n")

        put(rule)
        put(" " * 20 + "DPR QUALITY COMPARISON REPORT")
        put(rule)
        put()

        # Aggregate statistics are computed over successful analyses only.
        successes = [entry for entry in results_list if 'error' not in entry]
        if successes:
            totals = [entry['total_score'] for entry in successes]
            mean_total = sum(totals) / len(totals)

            put("SUMMARY STATISTICS:")
            put(thin_rule)
            put(f"Total DPRs Analyzed:     {len(results_list)}")
            put(f"Successfully Processed:  {len(successes)}")
            put(f"Average Score:           {mean_total:.2f}/25 ({(mean_total/25)*100:.1f}%)")
            put(f"Highest Score:           {max(totals)}/25")
            put(f"Lowest Score:            {min(totals)}/25")
            put()

        put(rule)
        put("INDIVIDUAL DPR ANALYSIS:")
        put(rule)
        put()

        for position, entry in enumerate(results_list, 1):
            put()
            put(rule)
            put(f"DPR #{position}: {os.path.basename(entry.get('pdf_path', 'Unknown'))}")
            put(rule)
            put()

            if 'error' in entry:
                put(f"ERROR: {entry.get('error')}")
            else:
                component_scores = entry['scores']
                put(f"Overall Score:    {entry['total_score']}/25 ({entry['percentage']}%)")
                put(f"Grade:            {entry['grade']}")
                put()

                put("Component Scores:")
                put(f"  • Geo-coordinates:    {component_scores['geo_coordinates']}/5")
                put(f"  • Techno-economic:    {component_scores['techno_economic']}/10")
                put(f"  • GatiShakti:         {component_scores['gatishakti']}/5")
                put(f"  • Timeline:           {component_scores['timeline']}/5")
                put()

                if entry['evidence']:
                    put("Evidence:")
                    for component, findings in entry['evidence'].items():
                        put()
                        put(f"{component.upper().replace('_', ' ')}:")
                        for finding in findings:
                            put(f"  • {finding}")

            put()




In [None]:
def get_tech_score(pdf_path):
    """Return only the total technical-quality score of a DPR PDF.

    The analysis runs silently (``verbose=False``). Any failure - an
    ``'error'`` entry in the result or an exception raised while analysing -
    is reported as ``None`` rather than propagated.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        The ``'total_score'`` value of the analysis result (scored out of
        25), or ``None`` if the analysis failed.
    """
    try:
        outcome = TrueNLPDPRAnalyzer(verbose=False).analyze_dpr(pdf_path)
        return None if 'error' in outcome else outcome['total_score']
    except Exception:
        # Deliberate best effort: callers only distinguish score vs. None.
        return None




In [None]:
def detect_missing_dpr_components(pdf_path, verbose=True):
    """Report which of the four scored DPR quality components are missing.

    A component counts as missing when its score is exactly 0. Every
    component that is not missing contributes its *maximum* points to
    ``obtainable_points``.

    Args:
        pdf_path: Path to the PDF file to analyze.
        verbose: Passed through to the analyzer (prints detailed output).

    Returns:
        dict: On analyzer failure, ``{'error', 'missing_components',
        'pdf_path'}``. Otherwise::

            {
                'pdf_path': ...,
                'missing_components': [ {component, max_points,
                                         current_score, status, evidence} ],
                'missing_count': int,
                'total_possible_points': 25,
                'obtainable_points': int,
                'points_at_risk': 25 - obtainable_points,
                'analysis_results': <full analyzer result>,
                'summary': { <score key>: {score, max, status} }
            }
    """
    results = TrueNLPDPRAnalyzer(verbose=verbose).analyze_dpr(pdf_path)

    if 'error' in results:
        return {
            'error': results['error'],
            'missing_components': ['Unable to analyze - extraction failed'],
            'pdf_path': pdf_path
        }

    # (key in results['scores'], display name, maximum points) in report order.
    component_specs = (
        ('geo_coordinates', 'Geo-coordinates', 5),
        ('techno_economic', 'Techno-economic vetting', 10),
        ('gatishakti', 'GatiShakti alignment', 5),
        ('timeline', 'Realistic timeline', 5),
    )

    missing_components = []
    obtainable_points = 0
    summary = {}

    for score_key, display_name, max_points in component_specs:
        component_score = results['scores'][score_key]

        if component_score == 0:
            missing_components.append({
                'component': display_name,
                'max_points': max_points,
                'current_score': 0,
                'status': 'MISSING',
                'evidence': results['evidence'][score_key]
            })
        else:
            obtainable_points += max_points

        summary[score_key] = {
            'score': component_score,
            'max': max_points,
            'status': 'Present' if component_score > 0 else 'MISSING'
        }

    return {
        'pdf_path': pdf_path,
        'missing_components': missing_components,
        'missing_count': len(missing_components),
        'total_possible_points': 25,
        'obtainable_points': obtainable_points,
        'points_at_risk': 25 - obtainable_points,
        'analysis_results': results,
        'summary': summary
    }


def print_missing_components_report(detection_results):
    """Print a formatted console report of missing DPR components.

    Args:
        detection_results: Output of ``detect_missing_dpr_components()``.
            If it carries an ``'error'`` key, only that error is printed.
    """
    if 'error' in detection_results:
        print(f"\n Analysis Error: {detection_results['error']}\n")
        return

    banner = "=" * 70
    divider = "-" * 70

    print("\n" + banner)
    print("       DPR MISSING COMPONENTS ANALYSIS")
    print(banner)

    print(f"\n File: {detection_results['pdf_path']}")

    missing_count = detection_results['missing_count']

    if missing_count != 0:
        print(f"\n  MISSING COMPONENTS: {missing_count}")
        print(f"   Points at Risk: {detection_results['points_at_risk']}/25")
        print(f"   Maximum Obtainable: {detection_results['obtainable_points']}/25\n")

        print(divider)
        # Numbered detail block for every missing component.
        for position, entry in enumerate(detection_results['missing_components'], 1):
            print(f"\n{position}. {entry['component'].upper()}")
            print(f"   Max Points: {entry['max_points']}")
            print(f"   Status: {entry['status']}")
            details = entry['evidence']
            if details:
                print("   Details:")
                for detail in details:
                    print(f"     • {detail}")
    else:
        print("\n All components present!")
        print(f"\n   Total Points Available: {detection_results['obtainable_points']}/25")

    print("\n" + divider)
    print("COMPONENT STATUS SUMMARY:")
    print(divider)

    # One tick/cross line per component, in the summary's own order.
    for comp_name, comp_data in detection_results['summary'].items():
        mark = "✗"
        if comp_data['status'] == 'Present':
            mark = "✓"
        label = comp_name.replace('_', ' ').title()
        print(f"  {mark} {label}: "
              f"{comp_data['score']}/{comp_data['max']} - {comp_data['status']}")

    print("\n" + banner + "\n")

SUSTAINABILITY SCORE

In [None]:
"""
Enhanced Impact & Sustainability Score Calculator for DPR Analysis
Includes proper PyMuPDF handling and fallback options
"""

from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
from typing import Dict, Any, Tuple, List
import re
import logging
from dataclasses import dataclass
import json
from datetime import datetime

try:
    import pymupdf as fitz
    PYMUPDF_AVAILABLE = True
except ImportError:
    try:
        import fitz
        PYMUPDF_AVAILABLE = True
    except ImportError:
        PYMUPDF_AVAILABLE = False
        print("⚠️ PyMuPDF not available. Install with: pip install pymupdf")


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- 1. Enhanced Scoring Configuration ---

# Maximum points awarded per Impact & Sustainability criterion.
# The criterion keys must match the keys of IMPACT_CRITERIA_SEMANTICS and
# KEYWORD_PATTERNS, which are indexed by the same names during analysis.
# "TOTAL_IMPACT" is not a criterion: it is the overall maximum used when
# printing/exporting totals and percentages, and it is filtered out before
# the per-criterion loop runs.
SCORE_WEIGHTS = {
    "Clear beneficiary identification": 5,
    "SDG/MPI alignment": 5,
    "O&M plan for 4+ years": 5,
    "Output-outcome framework with KPIs": 5,
    "TOTAL_IMPACT": 20,
}

# Enhanced semantic patterns with more diverse examples
# Reference phrases per criterion for the semantic check. The analyzer encodes
# each list once with the sentence-transformer model and compares document
# sentences against them by cosine similarity; the best similarity per
# sentence decides whether the criterion is semantically matched.
IMPACT_CRITERIA_SEMANTICS = {
    "Clear beneficiary identification": [
        "direct beneficiaries identified as households families individuals",
        "target population specified demographic groups communities",
        "socio-economic impact assessment on target beneficiaries",
        "primary stakeholders end-users clearly defined identified",
        "beneficiary analysis demographic profile vulnerable groups",
        "intended recipients project participants user groups",
        "affected population target communities specific groups identified",
        "number of beneficiaries households families receiving benefits"
    ],
    "SDG/MPI alignment": [
        "aligned with Sustainable Development Goals SDG targets",
        "contributes to reducing Multidimensional Poverty Index MPI",
        "supports achievement of SDG indicators targets",
        "poverty reduction sustainable development alignment",
        "environmental social governance ESG alignment SDG",
        "UN sustainable development agenda SDG framework",
        "poverty alleviation multidimensional deprivation indicators",
        "SDG goal target alignment contribution sustainability"
    ],
    "O&M plan for 4+ years": [
        "Operations and Maintenance plan for minimum 4 5 years",
        "O&M budget allocated post-construction maintenance phase",
        "long-term sustainability maintenance strategy model",
        "operational costs maintenance schedule 4 years onwards",
        "maintenance plan sustainable operations 4 5 year period",
        "post-implementation O&M framework multi-year plan",
        "operations maintenance provisions 4 years minimum",
        "annual maintenance budget operational expenses multi-year"
    ],
    "Output-outcome framework with KPIs": [
        "Output-Outcome Monitoring Framework defined established",
        "Key Performance Indicators KPIs metrics defined success",
        "logical framework log-frame matrix verifiable indicators",
        "results framework monitoring evaluation indicators",
        "performance metrics outcome indicators measurable targets",
        "KPI dashboard monitoring framework outcome measurement",
        "measurable indicators targets outputs outcomes results chain",
        "monitoring evaluation framework performance measurement KPI"
    ]
}

# Additional keyword patterns for rule-based checking
# Regex patterns per criterion for the rule-based check. They are applied with
# re.search(..., re.IGNORECASE), so short acronyms need explicit word
# boundaries: an unanchored "MPI" also matches inside "empirical", "compile"
# or "champion", and an unanchored "UN" matches the tail of "overrun targets".
KEYWORD_PATTERNS = {
    "Clear beneficiary identification": [
        r'beneficiar(?:y|ies)',
        r'target\s+(?:population|group|community|households?)',
        r'stakeholders?',
        r'end[- ]users?',
        r'\d+\s+(?:households?|families|individuals|people|persons|beneficiaries)',
        r'vulnerable\s+groups?',
        r'primary\s+recipients?',
        r'direct\s+beneficiaries',
        r'intended\s+recipients?'
    ],
    "SDG/MPI alignment": [
        r'SDG[s]?\s*(?:\d+)?',
        r'Sustainable\s+Development\s+Goals?',
        r'\bMPI\b',
        r'Multidimensional\s+Poverty\s+Index',
        r'poverty\s+(?:reduction|alleviation)',
        r'\bUN\s+(?:agenda|goals|targets)',
        r'SDG\s+(?:alignment|contribution|target)',
        r'goal\s+\d+',
        r'sustainable\s+development'
    ],
    "O&M plan for 4+ years": [
        r'O\s*&\s*M\s+(?:plan|budget|provisions?)',
        r'Operations?\s+(?:and|&|\+)\s+Maintenance',
        r'maintenance\s+(?:plan|strategy|schedule|budget)',
        r'(?:4|5|6|four|five|six)\s*\+?\s*years?\s+(?:plan|O&M|maintenance)',
        r'post[- ]construction\s+maintenance',
        r'long[- ]term\s+(?:maintenance|sustainability|operations?)',
        r'operational\s+costs?\s+\d+\s+years?',
        r'annual\s+maintenance',
        r'O&M\s+budget'
    ],
    "Output-outcome framework with KPIs": [
        r'KPI[s]?',
        r'Key\s+Performance\s+Indicators?',
        r'(?:output|outcome)[- ]framework',
        r'log[- ]?frame',
        r'logical\s+framework',
        r'monitoring\s+(?:and|&|\+)\s+evaluation',
        r'M\s*&\s*E\s+framework',
        r'performance\s+(?:metrics?|indicators?)',
        r'results?\s+framework',
        r'outcome\s+indicators?',
        r'measurable\s+indicators?',
        r'performance\s+measurement'
    ]
}

@dataclass
class ImpactScoreResult:
    """Outcome of scoring one Impact & Sustainability criterion.

    Instances are created by ``ImpactSustainabilityAnalyzer.analyze_criterion``
    and consumed by the report and JSON export methods.
    """
    # Criterion name; one of the non-total keys of SCORE_WEIGHTS.
    criterion: str
    # Points awarded: 0, or 30% / 60% / 80% / 100% of max_score.
    score: float
    # Maximum points for this criterion, taken from SCORE_WEIGHTS.
    max_score: float
    # Human-readable verdict label (pass / partial / weak / fail).
    status: str
    # Heuristic confidence attached to the verdict (0.0 when nothing matched).
    confidence: float
    # True when the best sentence similarity exceeded the semantic threshold.
    semantic_match: bool
    # True when at least one keyword regex matched the document text.
    keyword_match: bool
    # Supporting sentences, each truncated to 200 characters.
    evidence: List[str]

class ImpactSustainabilityAnalyzer:
    """Score a DPR's Impact & Sustainability criteria from its text.

    Every criterion in ``SCORE_WEIGHTS`` (except ``"TOTAL_IMPACT"``) is checked
    in two independent ways:

    * semantic: cosine similarity between the document's sentence embeddings
      and the reference phrases in ``IMPACT_CRITERIA_SEMANTICS``;
    * keyword: regex search with the patterns in ``KEYWORD_PATTERNS``.

    ``analyze_criterion`` combines both signals into a score, a status label
    and a confidence value.
    """

    def __init__(self, model_name: str = 'intfloat/e5-large-v2'):
        """Load the sentence-transformer model and pre-encode the criteria.

        Args:
            model_name: Model to load. If loading fails for any reason the
                lighter ``all-MiniLM-L6-v2`` model is loaded instead.
        """
        logger.info(f"Loading sentence transformer model: {model_name}")
        try:
            self.model = SentenceTransformer(model_name)
            logger.info("✅ Model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            logger.info("Falling back to lighter model: all-MiniLM-L6-v2")
            self.model = SentenceTransformer('all-MiniLM-L6-v2')

        # Pre-encode all criteria examples
        self.criteria_embeddings = self._precompute_embeddings()

    def _precompute_embeddings(self) -> Dict[str, Any]:
        """Encode the reference phrases of every criterion once.

        Returns:
            Mapping of criterion name to the tensor returned by
            ``self.model.encode(..., convert_to_tensor=True)``.
        """
        logger.info("Pre-computing embeddings for criteria examples...")
        embeddings = {}
        for criterion, examples in IMPACT_CRITERIA_SEMANTICS.items():
            embeddings[criterion] = self.model.encode(examples, convert_to_tensor=True)
        logger.info("✅ Embeddings computed successfully")
        return embeddings

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract the text layer of a PDF with PyMuPDF.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            The text of all pages, each page followed by a newline.

        Raises:
            ImportError: PyMuPDF could not be imported.
            FileNotFoundError: ``pdf_path`` does not exist.
            RuntimeError: Opening or reading failed, the PDF has no pages, or
                no text could be extracted (e.g. scanned images only). The
                original exception is chained as ``__cause__``.
        """
        if not PYMUPDF_AVAILABLE:
            error_msg = "❌ ERROR: PyMuPDF not installed. Install with: pip install pymupdf"
            logger.error(error_msg)
            raise ImportError("PyMuPDF library is required but not installed. Run: pip install pymupdf")

        # Check if file exists (os is imported here because it is not among
        # this cell's own top-level imports).
        import os
        if not os.path.exists(pdf_path):
            error_msg = f"❌ ERROR: PDF file not found: {pdf_path}"
            logger.error(error_msg)
            raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")

        try:
            page_texts = []
            with fitz.open(pdf_path) as doc:
                logger.info(f"📄 Extracting text from {pdf_path} ({len(doc)} pages)")

                if len(doc) == 0:
                    raise ValueError(f"PDF file has 0 pages: {pdf_path}")

                for page_num, page in enumerate(doc, 1):
                    page_texts.append(page.get_text("text"))
                    if page_num % 10 == 0:
                        logger.debug(f"  Processed {page_num} pages")

            # Single join instead of repeated `text +=` on a growing string.
            text = "".join(page_text + "\n" for page_text in page_texts)

            if not text.strip():
                raise ValueError("No text content extracted from PDF. PDF may contain only images or be empty.")

            logger.info(f"✅ Successfully extracted {len(text)} characters from PDF")
            return text

        except Exception as e:
            error_msg = f"❌ Error extracting text from PDF: {str(e)}"
            logger.error(error_msg)
            logger.error("Possible causes:")
            logger.error("  1. PDF is password-protected")
            logger.error("  2. PDF is corrupted")
            logger.error("  3. PDF contains only scanned images (requires OCR)")
            logger.error("  4. Insufficient permissions to read the file")
            raise RuntimeError(f"Failed to extract text from {pdf_path}: {str(e)}") from e

    def preprocess_text(self, text: str) -> List[str]:
        """Split raw text into sentence-like chunks worth embedding.

        Text is split on runs of ``.``, ``!``, ``?`` and newlines. A chunk is
        kept only if, after stripping, it has at least 20 characters and at
        least 4 whitespace-separated words.

        Returns:
            The kept chunks in document order.
        """
        # Split by common sentence boundaries
        sentences = re.split(r'[.!?\n]+', text)

        # Clean and filter sentences
        processed = []
        for sent in sentences:
            sent = sent.strip()
            # Keep sentences with minimum length and meaningful content
            if len(sent) >= 20 and len(sent.split()) >= 4:
                processed.append(sent)

        logger.info(f"✅ Preprocessed into {len(processed)} meaningful sentences")
        return processed

    def check_semantic_match(
        self,
        text_embeddings,
        criterion_embeddings,
        threshold: float = 0.50
    ) -> Tuple[bool, float, List[int]]:
        """
        Check semantic similarity between text and criterion examples

        For every sentence the best cosine similarity against any criterion
        example is taken; a sentence matches when that value is strictly
        greater than ``threshold``.

        Returns:
            Tuple of (match_found, max_score, matching_indices), where
            ``matching_indices`` lists the matching sentence positions in
            document order.
        """
        cosine_scores = util.cos_sim(text_embeddings, criterion_embeddings)
        max_scores_per_sentence = cosine_scores.max(dim=1).values
        max_score = float(max_scores_per_sentence.max())

        # Find all sentences above threshold
        matching_indices = [
            i for i, score in enumerate(max_scores_per_sentence)
            if score > threshold
        ]

        return max_score > threshold, max_score, matching_indices

    def check_keyword_match(self, text: str, patterns: List[str]) -> Tuple[bool, List[str]]:
        """
        Check for keyword patterns in text using regex

        Patterns are searched case-insensitively in the lower-cased text.

        Returns:
            Tuple of (match_found, matched_patterns); ``matched_patterns``
            keeps the order of ``patterns``.
        """
        text_lower = text.lower()
        matched = [
            pattern for pattern in patterns
            if re.search(pattern, text_lower, re.IGNORECASE)
        ]

        return len(matched) > 0, matched

    def extract_evidence(
        self,
        sentences: List[str],
        matching_indices: List[int],
        keyword_matches: List[str],
        max_evidence: int = 3
    ) -> List[str]:
        """Collect up to ``max_evidence`` sentences that justify a score.

        Semantically matching sentences are preferred. If there are none,
        the first sentences containing one of the matched keyword patterns
        are used instead, so that keyword-only verdicts are not reported
        without any supporting text.

        Args:
            sentences: Pre-processed document sentences.
            matching_indices: Positions in ``sentences`` found by the
                semantic check.
            keyword_matches: Regex patterns that matched the document.
            max_evidence: Upper bound on returned items.

        Returns:
            Evidence snippets, each truncated to 200 characters.
        """
        # Add sentences from semantic matching
        evidence = [
            sentences[idx][:200]  # Truncate long sentences
            for idx in matching_indices[:max_evidence]
            if idx < len(sentences)
        ]

        # Fall back to keyword-bearing sentences when nothing matched semantically.
        if not evidence and keyword_matches:
            for sentence in sentences:
                if len(evidence) >= max_evidence:
                    break
                if any(re.search(pattern, sentence, re.IGNORECASE) for pattern in keyword_matches):
                    evidence.append(sentence[:200])

        return evidence

    def analyze_criterion(
        self,
        criterion: str,
        text: str,
        sentences: List[str],
        text_embeddings,
        semantic_threshold: float = 0.50
    ) -> "ImpactScoreResult":
        """Score one criterion by combining the semantic and keyword checks.

        Scoring ladder (fraction of the criterion's maximum):
            semantic + keyword -> 100%, semantic only -> 80%,
            two or more keyword patterns -> 60%, one pattern -> 30%,
            nothing -> 0.

        Args:
            criterion: Key present in ``SCORE_WEIGHTS``, ``KEYWORD_PATTERNS``
                and ``self.criteria_embeddings``.
            text: Full document text (used for the keyword check).
            sentences: Pre-processed sentences (used for evidence).
            text_embeddings: Embeddings of ``sentences``.
            semantic_threshold: Similarity a sentence must exceed to match.

        Returns:
            ImpactScoreResult describing score, status, confidence and evidence.
        """

        max_score = SCORE_WEIGHTS[criterion]

        # Method 1: Semantic similarity check
        criterion_embeddings = self.criteria_embeddings[criterion]
        semantic_match, semantic_score, matching_indices = self.check_semantic_match(
            text_embeddings, criterion_embeddings, semantic_threshold
        )

        # Method 2: Keyword pattern matching
        patterns = KEYWORD_PATTERNS[criterion]
        keyword_match, keyword_patterns = self.check_keyword_match(text, patterns)

        # Scoring logic with confidence
        if semantic_match and keyword_match:
            score = max_score
            status = "✅ Pass - Strong Evidence (Semantic + Keywords)"
            # Both signals agree: boost the similarity, capped at 0.95.
            confidence = min(0.95, (semantic_score + 0.45))
        elif semantic_match:
            score = max_score * 0.8  # 80% if only semantic match
            status = "✅ Pass - Semantic Match"
            confidence = semantic_score
        elif keyword_match and len(keyword_patterns) >= 2:
            score = max_score * 0.6  # 60% if multiple keywords
            status = "⚠️ Partial - Keywords Found"
            confidence = 0.60
        elif keyword_match:
            score = max_score * 0.3  # 30% if single keyword
            status = "⚠️ Weak - Single Keyword"
            confidence = 0.40
        else:
            score = 0
            status = "❌ Fail - No Evidence Found"
            confidence = 0.0

        # Extract evidence
        evidence = self.extract_evidence(sentences, matching_indices, keyword_patterns)

        return ImpactScoreResult(
            criterion=criterion,
            score=score,
            max_score=max_score,
            status=status,
            confidence=confidence,
            semantic_match=semantic_match,
            keyword_match=keyword_match,
            evidence=evidence
        )

    def calculate_impact_sustainability_score(
        self,
        pdf_path: str = None,
        text: str = None,
        semantic_threshold: float = 0.50
    ) -> Tuple[float, Dict[str, "ImpactScoreResult"]]:
        """
        Main method to calculate complete Impact & Sustainability score

        Args:
            pdf_path: Path to the DPR PDF file (optional if text provided)
            text: Direct text input (optional if pdf_path provided); when
                given it is used as-is and the PDF is not read
            semantic_threshold: Threshold for semantic similarity (0.0 to 1.0)

        Returns:
            Tuple of (total_score, detailed_results_dict)

        Raises:
            ValueError: Neither input was given, the text has fewer than 100
                characters after stripping, or it yields no usable sentences.
            ImportError, FileNotFoundError, RuntimeError: Propagated from
                ``extract_text_from_pdf``.
        """
        logger.info(f"\n{'='*70}")
        logger.info("🚀 Starting Impact & Sustainability Analysis")
        if pdf_path:
            logger.info(f"📄 Document: {pdf_path}")
        logger.info(f"{'='*70}")

        # Extract and preprocess text
        if text is None:
            if pdf_path is None:
                raise ValueError("Either pdf_path or text must be provided")
            text = self.extract_text_from_pdf(pdf_path)

        if not text or len(text.strip()) < 100:
            if pdf_path is None:
                # Text was passed in directly: PDF troubleshooting hints (and
                # a "PDF: None" message) would only mislead the caller.
                logger.error("❌ ERROR: Provided text is too short to analyze")
                raise ValueError("Provided text is too short to analyze (fewer than 100 characters).")
            error_msg = f"❌ ERROR: Could not extract text from PDF: {pdf_path}"
            logger.error(error_msg)
            logger.error("Possible causes:")
            logger.error("  1. PDF file does not exist at the specified path")
            logger.error("  2. PDF file is corrupted or password-protected")
            logger.error("  3. PDF contains only images (requires OCR)")
            logger.error("  4. PyMuPDF is not properly installed")
            raise ValueError(f"Failed to extract text from PDF: {pdf_path}. File may be missing, corrupted, or contain only images.")

        sentences = self.preprocess_text(text)
        if not sentences:
            if pdf_path is None:
                logger.error("❌ ERROR: No valid sentences found in provided text")
                raise ValueError("No analyzable content found in provided text")
            error_msg = f"❌ ERROR: No valid sentences found in PDF: {pdf_path}"
            logger.error(error_msg)
            logger.error("The PDF may contain insufficient text content or only images")
            raise ValueError(f"No analyzable content found in PDF: {pdf_path}")

        # Encode sentences
        logger.info("🔄 Encoding text sentences...")
        text_embeddings = self.model.encode(
            sentences,
            batch_size=32,
            convert_to_tensor=True,
            show_progress_bar=False
        )

        # Analyze each criterion
        results = {}
        total_score = 0.0

        # "TOTAL_IMPACT" is the overall maximum, not a criterion of its own.
        criteria = [k for k in SCORE_WEIGHTS.keys() if k != "TOTAL_IMPACT"]

        for criterion in criteria:
            logger.info(f"\n📊 Analyzing: {criterion}")
            result = self.analyze_criterion(
                criterion, text, sentences, text_embeddings, semantic_threshold
            )
            results[criterion] = result
            total_score += result.score

            logger.info(f"  Score: {result.score:.1f}/{result.max_score}")
            logger.info(f"  Status: {result.status}")
            logger.info(f"  Confidence: {result.confidence:.2%}")

        logger.info(f"\n{'='*70}")
        logger.info(f"📈 TOTAL IMPACT & SUSTAINABILITY SCORE: {total_score:.1f}/{SCORE_WEIGHTS['TOTAL_IMPACT']}")
        logger.info(f"{'='*70}")

        return total_score, results

    def _get_demo_text(self) -> str:
        """Return demonstration text for testing when PDF is unavailable"""
        return """
        Project Title: Rural Healthcare Infrastructure Development

        Beneficiary Identification:
        The project will directly benefit 5,000 households across 15 villages in the district.
        Primary beneficiaries include rural families, women, children, and elderly population.
        Target population consists of approximately 25,000 individuals with limited healthcare access.
        Vulnerable groups including below poverty line families have been specifically identified.

        SDG Alignment:
        This project is aligned with Sustainable Development Goal 3 (Good Health and Well-being).
        The initiative contributes to SDG Goal 1 (No Poverty) by improving healthcare accessibility.
        Project outcomes support reduction in Multidimensional Poverty Index (MPI) indicators.
        Alignment with SDG 5 (Gender Equality) through focused maternal health services.

        Operations and Maintenance Plan:
        A comprehensive O&M plan for 5 years has been developed and budgeted.
        Annual maintenance budget of Rs. 2.5 crores allocated for sustainable operations.
        Operations and Maintenance framework includes staffing, equipment, and facility upkeep.
        Long-term sustainability ensured through state government commitment for 5+ years.
        Post-construction maintenance schedule with quarterly reviews established.

        Monitoring Framework:
        Detailed Output-Outcome Monitoring Framework with Key Performance Indicators defined.
        KPIs include: patient visits per month, reduction in infant mortality, immunization coverage.
        Logical framework matrix with measurable indicators and verifiable targets provided.
        Performance metrics for healthcare delivery and quality outcomes established.
        Quarterly monitoring and evaluation system with M&E framework operational.
        Results framework tracks outputs, outcomes, and long-term impact indicators.
        """

    def generate_detailed_report(
        self,
        pdf_path: str,
        total_score: float,
        results: Dict[str, "ImpactScoreResult"]
    ) -> str:
        """Generate a comprehensive formatted report

        Args:
            pdf_path: Document name shown in the report header.
            total_score: Sum of the criterion scores.
            results: Per-criterion results as returned by
                ``calculate_impact_sustainability_score``.

        Returns:
            The report text: header, one block per criterion, a
            recommendation for every criterion below 80% of its maximum,
            and an overall rating.
        """

        output = "\n" + "="*70 + "\n"
        output += "  📋 IMPACT & SUSTAINABILITY SCORE REPORT\n"
        output += f"  Document: {pdf_path}\n"
        output += f"  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
        output += "="*70 + "\n"
        output += f"  🎯 TOTAL SCORE: {total_score:.1f} / {SCORE_WEIGHTS['TOTAL_IMPACT']}\n"
        output += f"  📊 Percentage: {(total_score/SCORE_WEIGHTS['TOTAL_IMPACT'])*100:.1f}%\n"
        output += "-"*70 + "\n\n"

        for criterion, result in results.items():
            # Status indicator
            if result.score >= result.max_score * 0.8:
                indicator = "✅"
            elif result.score >= result.max_score * 0.5:
                indicator = "⚠️"
            else:
                indicator = "❌"

            output += f"{indicator} {criterion}\n"
            output += f"   Score: {result.score:.1f} / {result.max_score} pts\n"
            output += f"   Status: {result.status}\n"
            output += f"   Confidence: {result.confidence:.2%}\n"
            output += f"   Semantic Match: {'Yes ✓' if result.semantic_match else 'No ✗'} | "
            output += f"Keyword Match: {'Yes ✓' if result.keyword_match else 'No ✗'}\n"

            if result.evidence:
                output += "   📝 Evidence:\n"
                for i, ev in enumerate(result.evidence, 1):
                    output += f"     {i}. {ev}...\n"

            output += "\n"

        # Recommendations
        output += "-"*70 + "\n"
        output += "💡 RECOMMENDATIONS:\n"
        output += "-"*70 + "\n"

        has_recommendations = False
        for criterion, result in results.items():
            if result.score < result.max_score * 0.8:
                has_recommendations = True
                output += f"• {criterion}: "
                if result.score == 0:
                    output += "❌ Missing - Add comprehensive details\n"
                elif result.score < result.max_score * 0.5:
                    output += "⚠️ Insufficient - Strengthen documentation\n"
                else:
                    output += "⚠️ Weak - Enhance with more specific details\n"

        if not has_recommendations:
            output += "✅ No major issues identified\n"

        output += "\n"

        # Overall rating bands are absolute points on the 20-point scale.
        if total_score >= 18:
            output += "✨ OVERALL: Excellent - Ready for approval\n"
        elif total_score >= 14:
            output += "✓ OVERALL: Good - Minor improvements recommended\n"
        elif total_score >= 10:
            output += "⚠ OVERALL: Adequate - Moderate improvements needed\n"
        else:
            output += "❌ OVERALL: Poor - Substantial revision required\n"

        output += "="*70 + "\n"

        return output

    def export_results_to_json(
        self,
        pdf_path: str,
        total_score: float,
        results: Dict[str, "ImpactScoreResult"],
        output_path: str
    ):
        """Export results to JSON format

        Writes a UTF-8 JSON document containing the document name, an ISO
        timestamp, the rounded total/percentage and one entry per criterion.

        Args:
            pdf_path: Document name stored under ``"document"``.
            total_score: Sum of the criterion scores.
            results: Per-criterion results.
            output_path: Destination file (overwritten).
        """
        export_data = {
            "document": pdf_path,
            "timestamp": datetime.now().isoformat(),
            "total_score": round(total_score, 2),
            "max_score": SCORE_WEIGHTS['TOTAL_IMPACT'],
            "percentage": round((total_score / SCORE_WEIGHTS['TOTAL_IMPACT']) * 100, 2),
            "criteria": {
                criterion: {
                    "score": round(result.score, 2),
                    "max_score": result.max_score,
                    "status": result.status,
                    "confidence": round(result.confidence, 3),
                    "semantic_match": result.semantic_match,
                    "keyword_match": result.keyword_match,
                    "evidence": result.evidence
                }
                for criterion, result in results.items()
            }
        }

        # ensure_ascii=False keeps the status emojis readable in the file.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        logger.info(f"✅ Results exported to {output_path}")


def analyze_multiple_dprs(file_paths: List[str], output_csv: str = "impact_scores.csv"):
    """
    Analyze several DPR PDFs and produce a comparative summary.

    For every path the analyzer is run, its detailed report is printed, the
    per-document results are exported to ``<name>_impact_analysis.json`` next
    to the PDF, and one summary row is collected. A failure on one file is
    recorded and does not stop the remaining files.

    Args:
        file_paths: Paths of the DPR PDF files to analyze.
        output_csv: Destination CSV for the summary of successful analyses.

    Returns:
        Tuple ``(all_results, errors)``: the list of summary-row dicts and the
        list of ``{"DPR", "Error"}`` dicts for the files that failed.

    Side effects:
        Writes one JSON file per successful document, ``output_csv`` when at
        least one document succeeded, and ``impact_scores_errors.csv`` when at
        least one failed. Prints the reports and summaries to stdout.
    """
    import os  # local import: only needed to derive the JSON export path
    import traceback

    analyzer = ImpactSustainabilityAnalyzer()
    all_results = []
    errors = []

    # Same denominator as export_results_to_json, so the CSV percentage and
    # the JSON percentage cannot drift apart.
    max_total = SCORE_WEIGHTS['TOTAL_IMPACT']

    for pdf_path in file_paths:
        try:
            logger.info(f"\n{'='*70}")
            logger.info(f"Processing: {pdf_path}")
            logger.info(f"{'='*70}")

            # Calculate scores
            total_score, results = analyzer.calculate_impact_sustainability_score(pdf_path=pdf_path)

            # Generate and print report
            report = analyzer.generate_detailed_report(pdf_path, total_score, results)
            print(report)

            # Export JSON. Only the final extension is stripped, and only when
            # it is a (case-insensitive) ".pdf"; otherwise the suffix is
            # appended to the full path. A plain str.replace('.pdf', ...)
            # would return the input path unchanged for "X.PDF" or
            # extension-less names and the export would overwrite the source.
            stem, extension = os.path.splitext(pdf_path)
            if extension.lower() != '.pdf':
                stem = pdf_path
            json_path = f"{stem}_impact_analysis.json"
            analyzer.export_results_to_json(pdf_path, total_score, results, json_path)

            # Store for DataFrame
            row = {
                "DPR": pdf_path,
                "Total Score": round(total_score, 1),
                "Percentage": f"{(total_score / max_total) * 100:.1f}%",
                "Status": "✅ Success"
            }

            for criterion, result in results.items():
                row[f"{criterion} (Score)"] = round(result.score, 1)
                # A criterion is ticked when it reaches at least 80% of its maximum.
                row[f"{criterion} (Status)"] = "✓" if result.score >= result.max_score * 0.8 else "✗"

            all_results.append(row)

        except FileNotFoundError:
            error_msg = f"❌ File Not Found: {pdf_path}"
            logger.error(error_msg)
            errors.append({"DPR": pdf_path, "Error": "File not found"})
            print(f"\n{error_msg}\n")

        except ValueError as e:
            error_msg = f"❌ Invalid PDF: {pdf_path} - {str(e)}"
            logger.error(error_msg)
            errors.append({"DPR": pdf_path, "Error": str(e)})
            print(f"\n{error_msg}\n")

        except RuntimeError as e:
            error_msg = f"❌ Processing Error: {pdf_path} - {str(e)}"
            logger.error(error_msg)
            errors.append({"DPR": pdf_path, "Error": str(e)})
            print(f"\n{error_msg}\n")

        except Exception as e:
            # Anything unexpected is still recorded so the batch keeps going;
            # the traceback is printed to make the failure diagnosable.
            error_msg = f"❌ Unexpected Error: {pdf_path}"
            logger.error(error_msg)
            logger.error(f"Details: {str(e)}")
            errors.append({"DPR": pdf_path, "Error": f"Unexpected error: {str(e)}"})
            traceback.print_exc()
            print(f"\n{error_msg}\n")

    # Create summary DataFrame
    if all_results:
        df = pd.DataFrame(all_results)
        df.to_csv(output_csv, index=False)

        print("\n" + "="*70)
        print("📊 COMPARATIVE SUMMARY - SUCCESSFUL ANALYSES")
        print("="*70)
        print(df.to_string(index=False))
        print(f"\n✅ Summary exported to: {output_csv}")
    else:
        print("\n" + "="*70)
        print("❌ NO SUCCESSFUL ANALYSES")
        print("="*70)
        print("All PDF files failed to process. See errors below.")

    # Display errors summary
    if errors:
        print("\n" + "="*70)
        print("⚠️ ERRORS ENCOUNTERED")
        print("="*70)
        error_df = pd.DataFrame(errors)
        print(error_df.to_string(index=False))

        # Export errors to CSV
        error_csv = "impact_scores_errors.csv"
        error_df.to_csv(error_csv, index=False)
        print(f"\n❌ Errors exported to: {error_csv}")

    return all_results, errors




In [None]:
def get_sustainability_score(pdf_path: str) -> float:
    """
    Return only the total Impact & Sustainability score for one DPR PDF.

    A fresh ImpactSustainabilityAnalyzer is created for the call. While it
    runs, the root logger is raised to WARNING so INFO/DEBUG records are
    hidden; the previous level is restored before the function returns.

    Args:
        pdf_path: Path to the DPR PDF file.

    Returns:
        The total score reported by the analyzer (documented as being out of
        20.0), or 0.0 if any exception is raised while processing.
    """
    root_logger = logging.getLogger()
    level_before = root_logger.level
    score = 0.0  # value returned whenever processing fails
    try:
        # Quieten the analyzer: only WARNING and above get through.
        root_logger.setLevel(logging.WARNING)

        scorer = ImpactSustainabilityAnalyzer()
        # The per-criterion details (second element) are not needed here.
        score, _ = scorer.calculate_impact_sustainability_score(
            pdf_path=pdf_path
        )
    except (FileNotFoundError, ValueError, RuntimeError, ImportError) as e:
        # Anticipated failure modes: logged with traceback, reported as 0.0.
        logger.error(f"Critical error processing {pdf_path}: {str(e)}", exc_info=True)
    except Exception as e:
        # Anything else is treated the same way, with a distinct message.
        logger.error(f"An unexpected error occurred during analysis of {pdf_path}: {str(e)}", exc_info=True)
    finally:
        # Always undo the temporary level change so callers are unaffected.
        root_logger.setLevel(level_before)
    return score

# Example usage without a real PDF:
#
# When no DPR PDF is available, the variant below scores the analyzer's
# built-in demo text instead, so the scoring pipeline can still be exercised:

def get_sustainability_score_demo(pdf_path: str = "demo_text_analysis") -> float:
    """
    DEMO VERSION: score the analyzer's built-in demo text, or a PDF if given.

    Args:
        pdf_path: The sentinel "demo_text_analysis" (default) selects the text
            returned by the analyzer's ``_get_demo_text()``; any other value
            is passed to the analyzer as a PDF path.

    Returns:
        The total score from the analyzer, or 0.0 if any exception is raised.
    """
    root_logger = logging.getLogger()
    level_before = root_logger.level
    score = 0.0  # value returned whenever processing fails
    try:
        root_logger.setLevel(logging.WARNING)
        analyzer = ImpactSustainabilityAnalyzer()

        # Pick the analyzer input: demo text for the sentinel, else the path.
        if pdf_path == "demo_text_analysis":
            call_kwargs = {"text": analyzer._get_demo_text()}
        else:
            call_kwargs = {"pdf_path": pdf_path}

        score, _ = analyzer.calculate_impact_sustainability_score(**call_kwargs)
    except Exception as e:
        logger.error(f"Error in demo processing: {str(e)}")
    finally:
        # Restore the root logger level changed above.
        root_logger.setLevel(level_before)
    return score

# # Uncomment the block below to run a test with the internal demo text
# if __name__ == "__main__":
#     print("\n" + "="*50)
#     print("🚀 Test Run: get_sustainability_score_demo")
#     print(" (Analysis prints suppressed; only final score returned)")
#     print("="*50)
#
#     # This runs the demo text, which should score the maximum 20.0
#     final_score = get_sustainability_score_demo()
#
#     print(f"\n✅ Final Sustainability Score (Demo Text): {final_score:.1f}")
#
#     print("\n" + "="*50)
#     print("🚀 Test Run: get_sustainability_score (Real PDF)")
#     print(" (This will likely fail unless 'sampleDPR1.pdf' exists)")
#     print("="*50)
#
#     # This runs the real function which requires an actual PDF
#     try:
#         score_real = get_sustainability_score("sampleDPR1.pdf")
#         print(f"\nFinal Sustainability Score (sampleDPR1.pdf): {score_real:.1f}")
#     except Exception as e:
#         print(f"\nNote: The full function failed due to missing file (expected): {e}")

In [None]:
def detect_missing_sustainability_components(pdf_path=None, text=None, verbose=True):
    """
    Detect which Sustainability & Impact components are missing from a DPR.

    Runs the full ImpactSustainabilityAnalyzer scoring and flags every
    component whose score is 0. A component the analyzer did not report at
    all is treated as missing with a score of 0.

    Args:
        pdf_path: Path to the PDF file to analyze.
        text: Direct text input (alternative to pdf_path).
        verbose: When True, print the error message if the analysis fails.

    Returns:
        On success, a dict with the keys:
            'pdf_path': the given path, or 'text_input' when none was given
            'missing_components': list of dicts (component, max_points,
                current_score, status, semantic_match, keyword_match, evidence)
            'missing_count': number of missing components
            'total_possible_points': 20
            'obtainable_points': 5 for every component that is not missing
            'points_at_risk': 20 - obtainable_points
            'analysis_results': the analyzer's per-criterion results
            'total_score': the analyzer's total score
            'summary': per-component {'score', 'max', 'status'}
        If the analysis raises, a reduced dict:
            {'error': str, 'missing_components': [message], 'pdf_path': ...}
    """
    # The four scored components, each worth the same number of points.
    component_names = (
        "Clear beneficiary identification",
        "SDG/MPI alignment",
        "O&M plan for 4+ years",
        "Output-outcome framework with KPIs",
    )
    points_each = 5
    total_possible = points_each * len(component_names)  # 20

    # Run the full analysis
    analyzer = ImpactSustainabilityAnalyzer()

    try:
        total_score, results = analyzer.calculate_impact_sustainability_score(
            pdf_path=pdf_path,
            text=text
        )
    except Exception as e:
        if verbose:
            print(f"Analysis Error: {str(e)}")
        return {
            'error': str(e),
            'missing_components': ['Unable to analyze - extraction failed'],
            'pdf_path': pdf_path or 'text_input'
        }

    missing_components = []
    obtainable_points = 0
    summary = {}

    for criterion in component_names:
        # An unreported criterion is handled like a zero score instead of
        # raising KeyError when the summary is built.
        result = results[criterion] if criterion in results else None
        score = result.score if result is not None else 0

        if score == 0:
            missing_components.append({
                'component': criterion,
                'max_points': points_each,
                'current_score': 0,
                'status': 'MISSING',
                'semantic_match': result.semantic_match if result is not None else False,
                'keyword_match': result.keyword_match if result is not None else False,
                'evidence': result.evidence if result is not None else []
            })
        else:
            obtainable_points += points_each

        summary[criterion] = {
            'score': score,
            'max': points_each,
            'status': 'Present' if score > 0 else 'MISSING'
        }

    # Build return object
    return {
        'pdf_path': pdf_path or 'text_input',
        'missing_components': missing_components,
        'missing_count': len(missing_components),
        'total_possible_points': total_possible,
        'obtainable_points': obtainable_points,
        'points_at_risk': total_possible - obtainable_points,
        'analysis_results': results,
        'total_score': total_score,
        'summary': summary
    }


def print_missing_sustainability_report(detection_results):
    """
    Print a formatted report of missing Sustainability & Impact components.

    For a failed analysis (an 'error' key is present) only the error line is
    printed. Otherwise the report shows the file, either an all-present note
    or one numbered entry per missing component (with up to two evidence
    snippets cut to 100 characters), the per-component status summary and
    the total score.

    Args:
        detection_results: Output from detect_missing_sustainability_components()
    """
    if 'error' in detection_results:
        print(f"\nAnalysis Error: {detection_results['error']}\n")
        return

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    # Header
    print("\n" + heavy_rule)
    print("       SUSTAINABILITY & IMPACT MISSING COMPONENTS ANALYSIS")
    print(heavy_rule)

    print(f"\nFile: {detection_results['pdf_path']}")

    n_missing = detection_results['missing_count']

    if n_missing == 0:
        print("\nAll components present!")
        print(f"\n   Total Points Available: {detection_results['obtainable_points']}/20")
    else:
        print(f"\nMISSING COMPONENTS: {n_missing}")
        print(f"   Points at Risk: {detection_results['points_at_risk']}/20")
        print(f"   Maximum Obtainable: {detection_results['obtainable_points']}/20\n")

        print(light_rule)
        position = 0
        for entry in detection_results['missing_components']:
            position += 1
            print(f"\n{position}. {entry['component'].upper()}")
            print(f"   Max Points: {entry['max_points']}")
            print(f"   Current Score: {entry['current_score']}")
            print(f"   Status: {entry['status']}")
            print(f"   Semantic Match: {'Yes' if entry['semantic_match'] else 'No'}")
            print(f"   Keyword Match: {'Yes' if entry['keyword_match'] else 'No'}")
            snippets = entry['evidence']
            if snippets:
                print(f"   Evidence Found: {len(snippets)} instance(s)")
                # Only the first two snippets are shown, each cut to 100 chars.
                for snippet in snippets[:2]:
                    print(f"     - {snippet[:100]}...")

    # Per-component status table
    print("\n" + light_rule)
    print("COMPONENT STATUS SUMMARY:")
    print(light_rule)

    for label, info in detection_results['summary'].items():
        marker = "✗"
        if info['status'] == 'Present':
            marker = "✓"
        print(f"  {marker} {label}: "
              f"{info['score']}/{info['max']} - {info['status']}")

    print(f"\n  Total Score: {detection_results['total_score']:.1f}/20")

    print("\n" + heavy_rule + "\n")


def get_missing_components_list(detection_results):
    """
    Returns a simple list of missing component names.

    Args:
        detection_results: Output from detect_missing_sustainability_components()

    Returns:
        List of missing component names as strings. For a failed analysis the
        placeholder message stored in 'missing_components' is returned as-is.

    Raises:
        KeyError: if 'missing_components' is absent from detection_results.
    """
    names = []
    for entry in detection_results['missing_components']:
        # Successful analyses store one dict per component, but the error
        # result of detect_missing_sustainability_components() stores plain
        # strings; indexing those with ['component'] would raise TypeError.
        names.append(entry['component'] if isinstance(entry, dict) else str(entry))
    return names


def generate_missing_components_summary(detection_results):
    """
    Generates a summary dictionary for missing components.

    Args:
        detection_results: Output from detect_missing_sustainability_components()

    Returns:
        Dictionary with the keys 'pdf_file', 'is_complete',
        'missing_components_count', 'points_lost', 'maximum_score',
        'missing_list', 'total_sustainability_score' and 'status_breakdown'.

        When detection_results describes a failed analysis (it has an 'error'
        key and none of the scoring keys), the same keys are returned with
        'is_complete' False, the numeric fields set to None, an empty
        'status_breakdown', and an extra 'error' key carrying the message.
    """
    if 'error' in detection_results:
        # The failed-analysis result carries only 'error', 'missing_components'
        # (plain strings) and 'pdf_path'; reading the scoring keys below would
        # raise KeyError, so report the failure explicitly instead.
        return {
            'pdf_file': detection_results.get('pdf_path'),
            'is_complete': False,
            'missing_components_count': None,
            'points_lost': None,
            'maximum_score': None,
            'missing_list': [str(item) for item in detection_results.get('missing_components', [])],
            'total_sustainability_score': None,
            'status_breakdown': {},
            'error': detection_results['error']
        }

    return {
        'pdf_file': detection_results['pdf_path'],
        'is_complete': detection_results['missing_count'] == 0,
        'missing_components_count': detection_results['missing_count'],
        'points_lost': detection_results['points_at_risk'],
        'maximum_score': detection_results['obtainable_points'],
        'missing_list': get_missing_components_list(detection_results),
        'total_sustainability_score': detection_results['total_score'],
        'status_breakdown': detection_results['summary']
    }

COMPLIANCE

In [None]:
import os
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from google import genai
from google.genai import types
from google.genai.errors import APIError

# ============================================
#  CONFIGURATION
# ============================================
MODEL_NAME = "gemini-2.5-flash"
from getpass import getpass

# The API key comes from the environment; if it is not set, fall back to a
# hidden interactive prompt so the key never has to be written in the notebook.
GEMINI_KEY_STRING = os.getenv("GEMINI_KEY_STRING")
if not GEMINI_KEY_STRING:
    GEMINI_KEY_STRING = getpass("Enter your Gemini API key (hidden): ")

# ✅ Permanent question
FIXED_QUESTION = "What is the total budget and is it between 20 crore and 500 crore?"

# ✅ Marks rule
MARKS_IF_YES = 10

# ============================================
#  GEMINI CLIENT INITIALIZATION
# ============================================
try:
    client = genai.Client(api_key=GEMINI_KEY_STRING)
except Exception as e:
    print(f"Error initializing Gemini client: {e}")
    # `exit` is the interactive-shell helper: inside a Jupyter kernel it asks
    # the kernel to shut down instead of simply stopping this cell. Raising
    # SystemExit gives the same exit status in a script and halts the cell
    # cleanly in a notebook.
    raise SystemExit(1) from e

# ============================================
#  PDF TEXT EXTRACTION
# ============================================
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract the text of every page of a PDF with pypdf's PdfReader.

    Pages that yield no text are skipped; each remaining page's text is
    followed by a blank line. Progress and problems are reported with print().

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or "" when the file is missing, cannot be
        read as a PDF, or any other exception occurs.
    """
    print(f"Extracting text from: {pdf_path}...")
    try:
        reader = PdfReader(pdf_path)

        # Collect the per-page pieces and join once at the end.
        pieces = []
        for page in reader.pages:
            content = page.extract_text()
            if content:
                pieces.append(content + "\n\n")
        text = "".join(pieces)

        if text.strip():
            print(f" Extracted {len(text)} characters from {len(reader.pages)} pages.")
        else:
            print(" Warning: Extracted text is empty.")
        return text
    except FileNotFoundError:
        print(f" File not found: {pdf_path}")
        return ""
    except PdfReadError:
        print(f" Could not read PDF (maybe corrupt or encrypted): {pdf_path}")
        return ""
    except Exception as e:
        print(f"Unexpected error: {e}")
        return ""

# ============================================
#  SYSTEM INSTRUCTION
# ============================================
# System prompt passed as `system_instruction` on every Gemini request made by
# query_dpr(). It also fixes the exact reply the model should give when the
# document does not contain the answer.
SYSTEM_INSTRUCTION = """
You are a DPR Question Answering AI assistant.
Read the project document text and answer the question precisely.
If the answer is not in the document, reply: "Not mentioned in the document."
"""

# ============================================
#  GEMINI QA FUNCTION
# ============================================
def query_dpr(dpr_text: str) -> str:
    """
    Ask Gemini the fixed budget question about the given DPR text.

    Args:
        dpr_text: Full text of the DPR document.

    Returns:
        The model's answer with surrounding whitespace removed. Failures are
        reported as strings rather than raised:
        "No content to analyze." for empty input, "No clear answer found."
        when the model returns no text, and "API Error: ..." or
        "Unexpected error: ..." when the request itself fails.
    """
    if not dpr_text:
        return "No content to analyze."

    prompt = f"Document:\n{dpr_text}\n\nQuestion: {FIXED_QUESTION}\nAnswer:"

    try:
        response = client.models.generate_content(
            model=MODEL_NAME,
            contents=[prompt],
            config=types.GenerateContentConfig(
                system_instruction=SYSTEM_INSTRUCTION,
                response_mime_type="text/plain"
            )
        )
        # `response.text` can be None when the response carries no text part;
        # treat that like an empty answer instead of letting `.strip()` fail
        # and surface as a misleading "Unexpected error".
        answer = (response.text or "").strip()
        return answer or "No clear answer found."
    except APIError as e:
        return f"API Error: {e}"
    except Exception as e:
        return f"Unexpected error: {e}"

# ============================================
#  MARKS EVALUATION
# ============================================
def evaluate_answer_marks(answer: str) -> int:
    """
    Award marks when the answer contains an affirmative "yes".

    The match is case-insensitive and on the whole word only, so words that
    merely contain the letters (e.g. "yesterday", "eyes") do not earn marks.

    Args:
        answer: Answer text returned for the fixed question.

    Returns:
        MARKS_IF_YES when the word "yes" occurs in the answer, otherwise 0
        (including for an empty or None answer).
    """
    import re  # local import keeps this cell self-contained

    if not answer:
        return 0
    if re.search(r"\byes\b", answer, flags=re.IGNORECASE):
        return MARKS_IF_YES
    return 0

# ============================================
#  DISPLAY RESULT
# ============================================
def display_answer(answer: str, file_path: str, marks: int):
    """
    Print the question/answer result for one PDF as a framed block.

    Args:
        answer: Answer text to show.
        file_path: Path of the analysed PDF.
        marks: Marks awarded, shown against MARKS_IF_YES.
    """
    rule = "=" * 80

    # Header
    print("\n" + rule)
    print("                DPR PROJECT QUESTION-ANSWER SYSTEM                   ")
    print(rule + "\n")

    # Body: each entry ends with "\n" so print() leaves a blank line after it.
    body_lines = (
        f" PDF FILE ANALYZED: {file_path}\n",
        f" Question: {FIXED_QUESTION}\n",
        f" Answer: {answer}\n",
        f" Marks Awarded: {marks}/{MARKS_IF_YES}\n",
    )
    for line in body_lines:
        print(line)

    print(rule)

# ============================================
#  MAIN FUNCTION
# ============================================
def analyze_dpr(pdf_file_path: str):
    """
    Run the compliance check for one DPR: extract text, ask the fixed
    question, and score the answer.

    Args:
        pdf_file_path: Path of the DPR PDF.

    Returns:
        Tuple ``(answer, marks)``; ``(None, 0)`` when no text could be
        extracted from the PDF.
    """
    extracted = extract_text_from_pdf(pdf_file_path)

    if extracted:
        reply = query_dpr(extracted)
        awarded = evaluate_answer_marks(reply)
        # Printing of the result is disabled here; display_answer(reply,
        # pdf_file_path, awarded) shows the framed result when needed.
        return reply, awarded

    print(" Failed to extract text from PDF.")
    return None, 0


In [None]:
def total_marks(pdf_file):
    """
    Add up the four section scores for one DPR PDF.

    The sections are evaluated in this order: completeness
    (validate_dpr_with_marks, element [1] of its result, or 0 when the result
    is falsy), technical (get_tech_score), sustainability
    (get_sustainability_score) and compliance (marks from analyze_dpr).

    Args:
        pdf_file: Path of the DPR PDF.

    Returns:
        The sum of the four section scores.
    """
    validation = validate_dpr_with_marks(pdf_file)
    section_scores = (
        validation[1] if validation else 0,
        get_tech_score(pdf_file),
        get_sustainability_score(pdf_file),
        analyze_dpr(pdf_file)[1],
    )
    completeness_pts, technical_pts, sustainability_pts, compliance_pts = section_scores
    return completeness_pts + technical_pts + sustainability_pts + compliance_pts

# print(comp_score)
# print(tech_score)
# print(sustain_score)
# print(compliance_score)

ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet


Extracting text from: sampleDPR4.pdf...


ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advance

✅ Extracted 86706 characters from 73 pages.
Extracting text from: sampleDPR4.pdf...


ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advance

✅ Extracted 86706 characters from 73 pages.
[DEBUG] Section 'Project Profile', Key Point: 'Satellite image or photograph of project site', Max similarity: 0.85
[DEBUG] Section 'Beneficiary & Impact Analysis', Key Point: 'Alignment with Sustainable Development Goals (SDG) or Multidimensional Poverty Index (MPI)', Max similarity: 0.81
[DEBUG] Section 'Beneficiary & Impact Analysis', Key Point: 'Output-Outcome framework with Key Performance Indicators (KPIs)', Max similarity: 0.80
[DEBUG] Section 'Technical Specifications', Key Point: 'Alignment with GatiShakti Master Plan', Max similarity: 0.82
[DEBUG] Section 'Technical Specifications', Key Point: 'Compliance with concerned line department guidelines', Max similarity: 0.84
[DEBUG] Section 'Financial Details', Key Point: 'Cost estimates based on latest Schedule of Rates (SOR)', Max similarity: 0.87
[DEBUG] Section 'Financial Details', Key Point: 'Operations & Maintenance (O&M) cost for first 4 years included in total project cost', Max s

ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet


Extracting text from: sampleDPR4.pdf...


ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advanced encoding /SymbolSetEncoding not implemented yet
ERROR:pypdf._cmap:Advance

✅ Extracted 86706 characters from 73 pages.

                DPR PROJECT QUESTION-ANSWER SYSTEM                   

📄 PDF FILE ANALYZED: sampleDPR4.pdf

❓ Question: What is the total budget and is it between 20 crore and 500 crore?

💡 Answer: The total budget for the project is Rs. 50.88 crore.
Yes, it is between 20 crore and 500 crore.

🏆 Marks Awarded: 5/5

31.666666666666668
24.0
19.0
5
