In [None]:
%pip install groq pydantic pydantic-settings spacy

In [57]:
import json
import os
from enum import Enum
from typing import Dict, List, Optional

import spacy
from groq import Groq
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from neo4j import GraphDatabase
from pydantic import BaseModel, Field, field_validator
from pydantic_settings import BaseSettings

In [58]:
# Configuration Management
class Settings(BaseSettings):
    """Runtime configuration for the knowledge-graph pipeline.

    The credential/URI fields have no default and must be supplied when the
    settings object is built; `Config.env_file` names `.env` as a source.
    The two allow-lists mirror the entity and relationship vocabularies that
    the extraction prompt instructs the model to use.
    """

    GROQ_API_KEY: str
    NEO4J_URI: str
    NEO4J_USERNAME: str
    NEO4J_PASSWORD: str
    GROQ_MODEL: str = "deepseek-r1-distill-llama-70b"

    ALLOWED_ENTITY_TYPES: List[str] = Field(default=[
        "concept", "chapter", "subject", "formula",
        "diagram", "theorem", "example", "exercise",
        "activity", "experiment", "definition",
        "historical_figure", "scientist", "mathematician",
        "table", "equation",
    ])
    # Kept in sync with the "RELATIONSHIP TYPES" section of the extraction
    # prompt: the last four entries were listed there but missing here, so
    # prompt-conformant output would have been rejected by this allow-list.
    ALLOWED_RELATIONSHIPS: List[str] = Field(default=[
        "part_of", "related_to", "depends_on",
        "applies_to", "proved_by", "illustrated_by",
        "authored_by", "appears_in", "prerequisite_for",
        "instance_of", "method_of", "component_of", "example_of",
    ])

    class Config:
        env_file = ".env"

In [59]:
# Pydantic Models
class EntityType(str, Enum):
    """String-valued enumeration of the node types an entity may carry."""

    # Structural / conceptual units
    CONCEPT = "concept"
    CHAPTER = "chapter"
    SUBJECT = "subject"

    # Formal content
    FORMULA = "formula"
    DIAGRAM = "diagram"
    THEOREM = "theorem"

    # Worked material and learner tasks
    EXAMPLE = "example"
    EXERCISE = "exercise"
    ACTIVITY = "activity"
    EXPERIMENT = "experiment"
    DEFINITION = "definition"

    # People
    HISTORICAL_FIGURE = "historical_figure"
    SCIENTIST = "scientist"
    MATHEMATICIAN = "mathematician"

    # Tabular / equation content
    TABLE = "table"
    EQUATION = "equation"

class Entity(BaseModel):
    """A knowledge-graph node: a name, an `EntityType`, and string properties."""

    name: str
    type: EntityType
    properties: Dict[str, str] = Field(default_factory=dict)

    # The original method had no validator decorator, so pydantic never called
    # it. Registering it makes the configured allow-list actually apply.
    @field_validator("type")
    @classmethod
    def validate_entity_type(cls, v):
        """Reject entity types that are not in `Settings.ALLOWED_ENTITY_TYPES`.

        Args:
            v: The already enum-validated value of the `type` field.

        Returns:
            `v` unchanged when it is allowed.

        Raises:
            ValueError: If the type is not in the configured allow-list.
        """
        # Build the settings once instead of once per branch.
        allowed = Settings().ALLOWED_ENTITY_TYPES
        # Compare on the raw string so the error message shows "concept",
        # not the enum repr.
        raw = v.value if isinstance(v, Enum) else v
        if raw not in allowed:
            raise ValueError(f"Invalid entity type: {raw}. Allowed types: {allowed}")
        return v

In [60]:
class Relationship(BaseModel):
    """A directed knowledge-graph edge between two entities referenced by name."""

    source: str
    target: str
    type: str
    properties: Dict[str, str] = Field(default_factory=dict)

    # The original method had no validator decorator, so pydantic never called
    # it and any relationship type was accepted.
    @field_validator("type")
    @classmethod
    def validate_relationship_type(cls, v):
        """Reject relationship types not in `Settings.ALLOWED_RELATIONSHIPS`.

        Args:
            v: The value supplied for the `type` field.

        Returns:
            `v` unchanged when it is allowed.

        Raises:
            ValueError: If the type is not in the configured allow-list.
        """
        # Build the settings once instead of once per branch.
        allowed = Settings().ALLOWED_RELATIONSHIPS
        if v not in allowed:
            raise ValueError(f"Invalid relationship type: {v}. Allowed types: {allowed}")
        return v

In [61]:
class KnowledgeGraphData(BaseModel):
    """Container for one extraction result: entities, relationships and optional provenance."""

    # Graph payload (both required).
    entities: List[Entity]
    relationships: List[Relationship]

    # Optional provenance fields; each defaults to None when omitted.
    source_text: Optional[str] = Field(default=None)
    book_reference: Optional[str] = Field(default=None)
    grade_level: Optional[str] = Field(default=None)

In [62]:
# Knowledge Graph Builder
class NCERTKnowledgeGraphBuilder:
    def __init__(self):
        """Load settings and create the Groq client, spaCy pipeline and Neo4j driver."""
        cfg = Settings()
        self.settings = cfg

        # LLM client used for knowledge extraction.
        self.groq_client = Groq(api_key=cfg.GROQ_API_KEY)

        # spaCy pipeline used for text preprocessing.
        self.nlp = spacy.load("en_core_web_sm")

        # Neo4j driver used to persist the extracted graph.
        neo4j_auth = (cfg.NEO4J_USERNAME, cfg.NEO4J_PASSWORD)
        self.driver = GraphDatabase.driver(cfg.NEO4J_URI, auth=neo4j_auth)
    
    def extract_kg_elements(self, text: str, context: Optional[Dict] = None) -> Dict:
        """Extract KG elements with schema enforcement through prompt only"""
        prompt = self._build_prompt(text, context)
        
        try:
            response = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=self.settings.GROQ_MODEL,
                response_format={"type": "json_object"},
                temperature=0.1  # Lower temperature for more consistent formatting
            )
            
            kg_data = json.loads(response.choices[0].message.content)
            
            # Basic cleanup without strict validation
            if 'relationships' in kg_data:
                for rel in kg_data['relationships']:
                    # Ensure 'type' field exists
                    if 'relation' in rel:
                        rel['type'] = rel.pop('relation')
                    
            if context:
                kg_data.update(context)
                
            return kg_data
            
        except json.JSONDecodeError:
            raise ValueError("Invalid JSON response from Groq API")
        except Exception as e:
            raise ValueError(f"Failed to process KG data: {str(e)}")
    
    def _build_prompt(self, text: str, context: Optional[Dict]) -> str:
        """Construct a strict prompt that enforces the schema through instructions"""
        context_info = ""
        if context:
            context_info = (
                f"\nAdditional Context:\n"
                f"- Book: {context.get('book_reference', 'unknown')}\n"
                f"- Grade: {context.get('grade_level', 'unknown')}\n"
                f"- Subject: {context.get('subject', 'unknown')}\n"
            )
        
        return f"""
        Extract knowledge graph elements from NCERT textbook content following these STRICT rules:

        1. OUTPUT FORMAT (MUST follow exactly):
        {{
            "entities": [
                {{
                    "name": "entity_name",
                    "type": "allowed_type",
                    "properties": {{
                        "key": "value"
                    }}
                }}
            ],
            "relationships": [
                {{
                    "source": "source_entity_name",
                    "target": "target_entity_name",
                    "type": "relationship_type",
                    "properties": {{
                        "key": "value"
                    }}
                }}
            ]
        }}

        2. ENTITY TYPES (must use exactly these):
        - concept, chapter, subject, formula, diagram, theorem, 
        - example, exercise, activity, experiment, definition,
        - historical_figure, scientist, mathematician, table, equation

        3. RELATIONSHIP TYPES (must use exactly these):
        - part_of, related_to, depends_on, applies_to, proved_by,
        - illustrated_by, authored_by, appears_in, prerequisite_for,
        - instance_of, method_of, component_of, example_of

        4. SPECIAL INSTRUCTIONS:
        - Always include both "source" and "target" in relationships
        - Use "type" field for relationship type (not "relation")
        - For questions → type="exercise"
        - For figures/diagrams → type="diagram"
        - For math expressions → type="formula"

        {context_info}

        Textbook content to analyze:
        {text}

        Return ONLY the JSON output matching the exact format above.
        
            Extract knowledge graph elements with these additional rules:
    
        5. LABEL SIMPLIFICATION RULES:
        - Use simple, single-word labels where possible
        - For complex concepts: 
            - "chemical compound" → "compound"
            - "periodic table element" → "element"
            - "scientific phenomenon" → "phenomenon"
        - Replace spaces with underscores
        """
    
    def store_in_neo4j(self, kg_data: Dict):
        """Store extracted knowledge graph in Neo4j"""
        with self.driver.session() as session:
            # Create entities (nodes)
            for entity in kg_data.get("entities", []):
                session.execute_write(
                    self._create_entity_node,
                    entity.get("name", ""),
                    entity.get("type", "concept"),  # Default to 'concept' if missing
                    entity.get("properties", {})
                )
            
            # Create relationships
            for rel in kg_data.get("relationships", []):
                session.execute_write(
                    self._create_relationship,
                    rel.get("source", ""),
                    rel.get("target", ""),
                    rel.get("type", "related_to"),  # Default to 'related_to' if missing
                    rel.get("properties", {})
                )
            
            # Add metadata if available
            if kg_data.get("source_text") or kg_data.get("book_reference"):
                session.execute_write(
                    self._add_metadata,
                    kg_data.get("source_text"),
                    kg_data.get("book_reference"),
                    kg_data.get("grade_level")
                )
    
    def _create_entity_node(self, tx, name: str, label: str, properties: Dict):
        """Create a node with sanitized inputs"""
        # Sanitize inputs
        safe_label = self._sanitize_neo4j_input(label)
        safe_name = self._sanitize_neo4j_input(name)
        
        # Sanitize property keys and values
        safe_properties = {
            self._sanitize_neo4j_input(k): self._sanitize_neo4j_input(str(v))
            for k, v in properties.items()
        }
        
        props_str = ", ".join([f"{k}: ${k}" for k in safe_properties.keys()])
        query = (
            f"MERGE (n:`{safe_label}` {{name: $name"
            f"{', ' + props_str if props_str else ''}}})"
            f" SET n.created_at = datetime()"
        )
        params = {"name": safe_name, **safe_properties}
        tx.run(query, **params)

    def _create_relationship(self, tx, source: str, target: str, rel_type: str, properties: Dict):
        """Create a relationship with sanitized inputs"""
        safe_rel_type = self._sanitize_neo4j_input(rel_type)
        safe_source = self._sanitize_neo4j_input(source)
        safe_target = self._sanitize_neo4j_input(target)
        
        safe_properties = {
            self._sanitize_neo4j_input(k): self._sanitize_neo4j_input(str(v))
            for k, v in properties.items()
        }
        
        props_str = ", ".join([f"{k}: ${k}" for k in safe_properties.keys()])
        query = (
            f"MATCH (a), (b) "
            f"WHERE a.name = $source AND b.name = $target "
            f"MERGE (a)-[r:`{safe_rel_type}` "
            f"{'{' + props_str + '}' if props_str else ''}]->(b)"
            f" SET r.created_at = datetime()"
        )
        params = {"source": safe_source, "target": safe_target, **safe_properties}
        tx.run(query, **params)
        
    def _add_metadata(self, tx, source_text: Optional[str], book_ref: Optional[str], grade: Optional[str]):
        """Add processing metadata to the graph"""
        if book_ref:
            tx.run(
                "MERGE (m:Metadata {book_reference: $book_ref}) "
                "SET m.last_updated = datetime(), "
                "m.grade_level = $grade, "
                "m.processed = true",
                book_ref=book_ref,
                grade=grade
            )
    
    def process_material(self, text: str, context: Optional[Dict] = None) -> Dict:
        """Full processing pipeline that returns raw dictionary"""
        # Preprocessing
        doc = self.nlp(text)
        clean_text = " ".join([
            token.lemma_.lower() for token in doc 
            if not token.is_stop and not token.is_punct
        ])
        
        # Knowledge extraction
        kg_data = self.extract_kg_elements(clean_text, context)
        
        # Ensure basic structure exists
        kg_data.setdefault("entities", [])
        kg_data.setdefault("relationships", [])
        
        # Store in Neo4j
        self.store_in_neo4j(kg_data)
        
        return kg_data
    
    def load_and_process_pdf(self, file_path: str, context: Dict) -> List[Dict]:
        """Load one PDF, split it into chunks and run the pipeline on each chunk.

        Args:
            file_path: Path of the PDF to load.
            context: Metadata passed to `process_material` for every chunk.

        Returns:
            One raw extraction dict per chunk, in chunk order. The previous
            `KnowledgeGraphData` annotation did not match what is returned.

        Raises:
            ValueError: If loading, splitting or any chunk fails. Processing
                stops at the first failing chunk, and the original exception
                is chained as `__cause__`.
        """
        try:
            # Load PDF
            pages = PyPDFLoader(file_path).load()

            # Split text into manageable chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len
            )
            chunks = text_splitter.split_documents(pages)

            # Process each chunk
            return [
                self.process_material(chunk.page_content, context)
                for chunk in chunks
            ]

        except Exception as e:
            raise ValueError(f"Error processing PDF {file_path}: {str(e)}") from e

    def _sanitize_neo4j_input(self, input_str: str) -> str:
        """Sanitize strings for Neo4j labels and properties"""
        if not input_str:
            return "unknown"
        
        # Remove special characters and replace spaces
        sanitized = "".join(
            c if c.isalnum() or c in "_-" else "_" 
            for c in input_str.strip()
        )
        
        # Ensure it starts with a letter
        if sanitized and not sanitized[0].isalpha():
            sanitized = "e_" + sanitized
            
        return sanitized[:64]  # Limit length


In [63]:
# Instantiate the builder; its __init__ reads Settings and creates the Groq client,
# the spaCy pipeline and the Neo4j driver that the cells below rely on.
builder = NCERTKnowledgeGraphBuilder()

In [50]:
# Metadata attached to every chunk extracted from this book.
context = {
    "book_reference": "NCERT Science Class 10",
    "grade_level": "10",
    "subject": "Science",
}

# Chapter 1: run the PDF through the builder and print a short per-chunk summary.
pdf_path = "./books/ncert_class_x_science_chapter_1.pdf"
try:
    results = builder.load_and_process_pdf(pdf_path, context)
    for chunk_no, chunk_result in enumerate(results, start=1):
        print(f"\nProcessed chunk {chunk_no}:")
        found_entities = chunk_result.get("entities", [])
        print("Entities found:", len(found_entities))
        print("Relationships found:", len(chunk_result.get("relationships", [])))
        # Preview only the first three entities of the chunk.
        for item in found_entities[:3]:
            print(f"- {item.get('name')} ({item.get('type')})")
except Exception as exc:
    print(f"Error processing material: {str(exc)}")


Processed chunk 1:
Entities found: 9
Relationships found: 10
- chemical_reactions_equations (chapter)
- milk_room_temperature (example)
- iron_tawa_humid_atmosphere (example)

Processed chunk 2:
Entities found: 10
Relationships found: 10
- figure_1.1 (diagram)
- activity_1.1 (activity)
- magnesium_ribbon_burning (concept)

Processed chunk 3:
Entities found: 5
Relationships found: 5
- activity_1.2 (activity)
- activity_1.3 (activity)
- figure_1.2 (diagram)

Processed chunk 4:
Entities found: 7
Relationships found: 4
- magnesium (element)
- oxygen (element)
- magnesium_oxide (compound)

Processed chunk 5:
Entities found: 5
Relationships found: 4
- word_equation (concept)
- magnesium (element)
- oxygen (element)

Processed chunk 6:
Entities found: 6
Relationships found: 5
- chemical_equation (concept)
- chemical_formula (concept)
- magnesium (element)

Processed chunk 7:
Entities found: 10
Relationships found: 9
- balanced_chemical_equations (concept)
- law_conservation_mass (concept)
- 

In [51]:
# Chapter 2: run the PDF through the builder and print a short per-chunk summary.
pdf_path = "./books/ncert_class_x_science_chapter_2.pdf"
try:
    results = builder.load_and_process_pdf(pdf_path, context)
    for chunk_no, chunk_result in enumerate(results, start=1):
        print(f"\nProcessed chunk {chunk_no}:")
        found_entities = chunk_result.get("entities", [])
        print("Entities found:", len(found_entities))
        print("Relationships found:", len(chunk_result.get("relationships", [])))
        # Preview only the first three entities of the chunk.
        for item in found_entities[:3]:
            print(f"- {item.get('name')} ({item.get('type')})")
except Exception as exc:
    print(f"Error processing material: {str(exc)}")


Processed chunk 1:
Entities found: 5
Relationships found: 5
- acid (concept)
- basis (concept)
- litmus_indicator (indicator)

Processed chunk 2:
Entities found: 7
Relationships found: 4
- indicator (concept)
- acid (concept)
- base (concept)

Processed chunk 3:
Entities found: 7
Relationships found: 6
- acid_base_indicator (concept)
- indicator (concept)
- red_cabbage (example)

Processed chunk 4:
Entities found: 5
Relationships found: 4
- acids (concept)
- bases (concept)
- activity_2.1 (experiment)

Processed chunk 5:
Entities found: 16
Relationships found: 15
- hydrochloric_acid (compound)
- sulphuric_acid (compound)
- nitric_acid (compound)

Processed chunk 6:
Entities found: 10
Relationships found: 6
- table_2.1 (table)
- phenolphthalein_solution (compound)
- methyl_orange_solution (compound)

Processed chunk 7:
Entities found: 6
Relationships found: 5
- acids_bases_salts (concept)
- activity_2.1 (activity)
- activity_2.3 (activity)

Processed chunk 8:
Entities found: 6
Relation

In [52]:
# Chapter 3: run the PDF through the builder and print a short per-chunk summary.
pdf_path = "./books/ncert_class_x_science_chapter_3.pdf"
try:
    results = builder.load_and_process_pdf(pdf_path, context)
    for chunk_no, chunk_result in enumerate(results, start=1):
        print(f"\nProcessed chunk {chunk_no}:")
        found_entities = chunk_result.get("entities", [])
        print("Entities found:", len(found_entities))
        print("Relationships found:", len(chunk_result.get("relationships", [])))
        # Preview only the first three entities of the chunk.
        for item in found_entities[:3]:
            print(f"- {item.get('name')} ({item.get('type')})")
except Exception as exc:
    print(f"Error processing material: {str(exc)}")


Processed chunk 1:
Entities found: 11
Relationships found: 8
- metals (concept)
- non_metals (concept)
- Chapter_3 (chapter)

Processed chunk 2:
Entities found: 11
Relationships found: 11
- iron (element)
- copper (element)
- aluminium (element)

Processed chunk 3:
Entities found: 5
Relationships found: 4
- activity_3.3 (activity)
- activity_3.4 (activity)
- malleability (property)

Processed chunk 4:
Entities found: 8
Relationships found: 6
- gold (metal)
- malleability (property)
- ductility (property)

Processed chunk 5:
Entities found: 8
Relationships found: 7
- metal (concept)
- activity_show_metal_good_conductor_heat (activity)
- figure_3.1 (diagram)

Processed chunk 6:
Entities found: 10
Relationships found: 8
- metal (concept)
- non_metal (concept)
- activity_3.6 (activity)

Processed chunk 7:
Entities found: 8
Relationships found: 7
- electric_circuit_diagram (diagram)
- metal_test_circuit_activity (activity)
- carbon (element)

Processed chunk 8:
Entities found: 7
Relationsh

In [64]:
# Chapter 4: run the PDF through the builder and print a short per-chunk summary.
pdf_path = "./books/ncert_class_x_science_chapter_4.pdf"
try:
    results = builder.load_and_process_pdf(pdf_path, context)
    for chunk_no, chunk_result in enumerate(results, start=1):
        print(f"\nProcessed chunk {chunk_no}:")
        found_entities = chunk_result.get("entities", [])
        print("Entities found:", len(found_entities))
        print("Relationships found:", len(chunk_result.get("relationships", [])))
        # Preview only the first three entities of the chunk.
        for item in found_entities[:3]:
            print(f"- {item.get('name')} ({item.get('type')})")
except Exception as exc:
    print(f"Error processing material: {str(exc)}")

Error processing material: Error processing PDF ./books/ncert_class_x_science_chapter_4.pdf: Failed to process KG data: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


In [65]:
# Chapter 5: run the PDF through the builder and print a short per-chunk summary.
pdf_path = "./books/ncert_class_x_science_chapter_5.pdf"
try:
    results = builder.load_and_process_pdf(pdf_path, context)
    for chunk_no, chunk_result in enumerate(results, start=1):
        print(f"\nProcessed chunk {chunk_no}:")
        found_entities = chunk_result.get("entities", [])
        print("Entities found:", len(found_entities))
        print("Relationships found:", len(chunk_result.get("relationships", [])))
        # Preview only the first three entities of the chunk.
        for item in found_entities[:3]:
            print(f"- {item.get('name')} ({item.get('type')})")
except Exception as exc:
    print(f"Error processing material: {str(exc)}")

Error processing material: Error processing PDF ./books/ncert_class_x_science_chapter_5.pdf: Failed to process KG data: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


In [None]:
# Chapter 5 again: run the PDF through the builder and print a per-chunk summary.
# NOTE(review): this repeats the chapter 5 path from the previous cell, and no
# cell in this notebook processes chapter 6 — confirm whether this is an
# intentional retry or a copy-paste slip.
pdf_path = "./books/ncert_class_x_science_chapter_5.pdf"
try:
    results = builder.load_and_process_pdf(pdf_path, context)
    for chunk_no, chunk_result in enumerate(results, start=1):
        print(f"\nProcessed chunk {chunk_no}:")
        found_entities = chunk_result.get("entities", [])
        print("Entities found:", len(found_entities))
        print("Relationships found:", len(chunk_result.get("relationships", [])))
        # Preview only the first three entities of the chunk.
        for item in found_entities[:3]:
            print(f"- {item.get('name')} ({item.get('type')})")
except Exception as exc:
    print(f"Error processing material: {str(exc)}")

In [None]:
# Chapter 7: run the PDF through the builder and print a short per-chunk summary.
pdf_path = "./books/ncert_class_x_science_chapter_7.pdf"
try:
    results = builder.load_and_process_pdf(pdf_path, context)
    for chunk_no, chunk_result in enumerate(results, start=1):
        print(f"\nProcessed chunk {chunk_no}:")
        found_entities = chunk_result.get("entities", [])
        print("Entities found:", len(found_entities))
        print("Relationships found:", len(chunk_result.get("relationships", [])))
        # Preview only the first three entities of the chunk.
        for item in found_entities[:3]:
            print(f"- {item.get('name')} ({item.get('type')})")
except Exception as exc:
    print(f"Error processing material: {str(exc)}")