In [106]:
import os

In [198]:
os.listdir()

['core-ontology.ipynb',
 '.ruff_cache',
 'schema_output.jsonld',
 'schema.txt',
 'mypy.ini',
 'schema.ttl',
 '.mypy_cache',
 'output_ontology',
 '.ipynb_checkpoints',
 'schema.jsonld']

In [199]:
from rdflib import Graph

In [200]:
schema_filepath = "schema.ttl"

# Step 1: Parsing from schema.org ttl file

In [201]:
import rdflib
import json
import logging

# Configure basic logging
logging.basicConfig(level=logging.INFO)

# Define the input schema file path (assuming schema.txt is in the same directory)
# In a real application, this path would be configurable.
SCHEMA_FILE = "schema.txt"
SCHEMA_FORMAT = "turtle" # Explicitly state the format of schema.txt

# Define output path for JSON-LD (optional, for inspection)
JSON_LD_OUTPUT_FILE = "schema_output.jsonld"

def parse_schema_to_graph(file_path: str, file_format: str) -> rdflib.Graph | None:
    """
    Parses an RDF schema file into an rdflib Graph object.

    Args:
        file_path: Path to the schema file.
        file_format: The format of the RDF file (e.g., 'turtle', 'xml', 'json-ld').

    Returns:
        An rdflib.Graph object or None if parsing fails.
    """
    g = rdflib.Graph()
    try:
        logging.info(f"Attempting to parse schema file: {file_path} (format: {file_format})")
        g.parse(source=file_path, format=file_format)
        logging.info(f"Successfully parsed {len(g)} triples.")
        return g
    except FileNotFoundError:
        logging.error(f"Error: Schema file not found at {file_path}")
        return None
    except Exception as e:
        logging.error(f"Error parsing schema file '{file_path}' with format '{file_format}': {e}")
        return None

def serialize_graph_to_jsonld(graph: rdflib.Graph, output_file: str) -> bool:
    """
    Serializes an rdflib Graph object to a JSON-LD file.

    Args:
        graph: The rdflib.Graph object.
        output_file: Path to save the JSON-LD output file.

    Returns:
        True if serialization was successful, False otherwise.
    """
    if graph is None:
        logging.error("Cannot serialize: Graph object is None.")
        return False
    try:
        logging.info(f"Attempting to serialize graph to JSON-LD: {output_file}")
        # Common context for schema.org can help make JSON-LD more readable
        # Using a standard context URL
        context = {
            "@vocab": "https://schema.org/"
            # Add other prefixes if needed, though @vocab covers schema.org terms
        }
        # Serialize to JSON-LD format
        # Note: rdflib's json-ld serialization might produce a list of objects
        json_ld_data = graph.serialize(format='json-ld', context=context, indent=2)

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(json_ld_data)
        logging.info(f"Successfully serialized graph to {output_file}")
        return True
    except Exception as e:
        logging.error(f"Error serializing graph to JSON-LD: {e}")
        return False


In [202]:
# --- Main execution block for this step ---
if __name__ == "__main__":
    # Step 1: Parse the schema
    schema_graph = parse_schema_to_graph(SCHEMA_FILE, SCHEMA_FORMAT)

    if schema_graph:
        # Step 2 (as requested): Serialize to JSON-LD for inspection
        serialize_graph_to_jsonld(schema_graph, JSON_LD_OUTPUT_FILE)

        # Next steps (to be implemented in subsequent chunks) would involve
        # analyzing 'schema_graph' to extract class/property info
        # and then generating Pydantic models.
        logging.info("Graph parsed. Ready for Step 2: Schema Analysis (in next code chunk).")
    else:
        logging.error("Failed to parse schema graph. Cannot proceed.")

INFO:root:Attempting to parse schema file: schema.txt (format: turtle)
INFO:root:Successfully parsed 8904 triples.
INFO:root:Attempting to serialize graph to JSON-LD: schema_output.jsonld
INFO:root:Successfully serialized graph to schema_output.jsonld
INFO:root:Graph parsed. Ready for Step 2: Schema Analysis (in next code chunk).


In [203]:
g = parse_schema_to_graph(SCHEMA_FILE, SCHEMA_FORMAT)
g

INFO:root:Attempting to parse schema file: schema.txt (format: turtle)
INFO:root:Successfully parsed 8904 triples.


<Graph identifier=Na483d00186fa457fa32657c5fb34c138 (<class 'rdflib.graph.Graph'>)>

# Step 2: 

In [204]:
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, XSD # Common RDF/RDFS/OWL namespaces
from typing import List, Set, Dict, Optional, NamedTuple
import logging

# Assuming the 'parse_schema_to_graph' function from Chunk 1 exists
# and 'schema_graph' is the rdflib.Graph object returned by it.

# Define the schema.org namespace
SCHEMA = rdflib.Namespace("https://schema.org/")

In [205]:
# Define simple structures to hold extracted info
class PropertyInfo(NamedTuple):
    uri: rdflib.URIRef
    label: Optional[str]
    comment: Optional[str]
    domains: Set[rdflib.URIRef] # Classes where this property applies
    ranges: Set[rdflib.URIRef] # Expected types for this property's value

class ClassInfo(NamedTuple):
    uri: rdflib.URIRef
    label: Optional[str]
    comment: Optional[str]
    superclasses: Set[rdflib.URIRef] # Direct parent classes
    properties: Set[rdflib.URIRef] # Properties associated via domainIncludes


In [320]:
def get_label(graph: rdflib.Graph, subject: rdflib.URIRef) -> Optional[str]:
    """Gets the rdfs:label for a subject."""
    label = graph.value(subject=subject, predicate=RDFS.label)
    return str(label) if label else None

def get_comment(graph: rdflib.Graph, subject: rdflib.URIRef) -> Optional[str]:
    """Gets the rdfs:comment for a subject."""
    comment = graph.value(subject=subject, predicate=RDFS.comment)
    return str(comment) if comment else None

def find_schema_classes(graph: rdflib.Graph) -> Set[rdflib.URIRef]:
    """Finds all subjects defined as RDFS Classes within the schema.org namespace."""
    classes = set()
    # Find things explicitly declared as rdfs:Class or owl:Class
    for class_type in [RDFS.Class, OWL.Class]:
        for subject in graph.subjects(predicate=RDF.type, object=class_type):
            # Filter to include only those within the schema.org namespace
            if str(subject).startswith(str(SCHEMA)):
                 # Also check if it's a schema.org DataType, treat those differently later
                 is_datatype = (subject, RDF.type, SCHEMA.DataType) in graph
                 if not is_datatype:
                    classes.add(subject)
    # Schema.org also defines types like schema:Person without explicitly stating
    # rdf:type rdfs:Class in all serializations, but implies they are classes
    # by using them in domain/range or subClassOf. A more robust approach
    # might involve looking for usage in rdfs:subClassOf, :domainIncludes, :rangeIncludes
    # For now, primarily rely on explicit declaration if present.
    logging.info(f"Found {len(classes)} potential schema.org classes.")
    return classes

def find_schema_properties(graph: rdflib.Graph) -> Set[rdflib.URIRef]:
    """Finds all subjects defined as RDF Properties within the schema.org namespace."""
    properties = set()
    for subject in graph.subjects(predicate=RDF.type, object=RDF.Property):
         if str(subject).startswith(str(SCHEMA)):
             properties.add(subject)
    logging.info(f"Found {len(properties)} potential schema.org properties.")
    return properties

def get_property_details(graph: rdflib.Graph, prop_uri: rdflib.URIRef) -> PropertyInfo:
    """Extracts details for a given property URI."""
    label = get_label(graph, prop_uri)
    comment = get_comment(graph, prop_uri)
    domains = set(graph.objects(subject=prop_uri, predicate=SCHEMA.domainIncludes))
    ranges = set(graph.objects(subject=prop_uri, predicate=SCHEMA.rangeIncludes))
    return PropertyInfo(uri=prop_uri, label=label, comment=comment, domains=domains, ranges=ranges)

def get_class_details(graph: rdflib.Graph, class_uri: rdflib.URIRef, all_properties: Dict[rdflib.URIRef, PropertyInfo]) -> ClassInfo:
    """Extracts details for a given class URI."""
    label = get_label(graph, class_uri)
    comment = get_comment(graph, class_uri)
    superclasses = set(graph.objects(subject=class_uri, predicate=RDFS.subClassOf))
    # Find properties where this class is listed in the domain
    associated_properties = set()
    for prop_uri, prop_info in all_properties.items():
        if class_uri in prop_info.domains:
            associated_properties.add(prop_uri)

    # Also consider properties inherited from superclasses (requires recursive lookup - omitted for v0.1 simplicity)
    # For v0.1, we primarily care about properties directly associated via domainIncludes

    return ClassInfo(uri=class_uri, label=label, comment=comment, superclasses=superclasses, properties=associated_properties)

def analyze_schema_graph(graph: rdflib.Graph) -> Dict[str, Dict]:
    """
    Analyzes the RDF graph to extract structured info about classes and properties.

    Returns:
        A dictionary containing 'classes' and 'properties' information.
    """
    if not graph:
        logging.error("Analysis failed: Graph is empty or None.")
        return {"classes": {}, "properties": {}}

    schema_classes_uris = find_schema_classes(graph)
    schema_property_uris = find_schema_properties(graph)

    properties_info = {}
    for prop_uri in schema_property_uris:
        properties_info[prop_uri] = get_property_details(graph, prop_uri)

    classes_info = {}
    for class_uri in schema_classes_uris:
        # Skip RDFS/OWL base classes if they somehow got included
        if class_uri in [RDFS.Resource, OWL.Thing, RDFS.Class]:
             continue
        classes_info[class_uri] = get_class_details(graph, class_uri, properties_info)

    logging.info(f"Analyzed {len(classes_info)} classes and {len(properties_info)} properties.")
    return {"classes": classes_info, "properties": properties_info}


In [321]:
# --- Main execution block demonstrating analysis ---
if __name__ == "__main__":
    # Assume schema_graph is loaded from Step 1 (re-parse for standalone demo)
    schema_graph = parse_schema_to_graph(SCHEMA_FILE, SCHEMA_FORMAT)

    if schema_graph:
        # Step 2: Analyze the graph
        analyzed_schema = analyze_schema_graph(schema_graph)

        # Example: Print info for schema:Person and schema:address
        person_uri = SCHEMA.Person
        address_prop_uri = SCHEMA.address

        if person_uri in analyzed_schema["classes"]:
            print("\n--- Analysis for schema:Person ---")
            print(analyzed_schema["classes"][person_uri])
            print("-" * 30)

        if address_prop_uri in analyzed_schema["properties"]:
            print("\n--- Analysis for schema:address ---")
            print(analyzed_schema["properties"][address_prop_uri])
            print("-" * 30)

        # Output confirms we have structured data ready for mapping rules (Step 3)
        # and code generation (Step 4) in the next chunks.
        logging.info("Schema analysis complete. Ready for Step 3: Mapping Logic Definition.")
    else:
        logging.error("Failed to parse schema graph for analysis.")


INFO:root:Attempting to parse schema file: schema.txt (format: turtle)
INFO:root:Successfully parsed 8904 triples.
INFO:root:Found 628 potential schema.org classes.
INFO:root:Found 921 potential schema.org properties.
INFO:root:Analyzed 628 classes and 921 properties.
INFO:root:Schema analysis complete. Ready for Step 3: Mapping Logic Definition.



--- Analysis for schema:Person ---
ClassInfo(uri=rdflib.term.URIRef('https://schema.org/Person'), label='Person', comment='A person (alive, dead, undead, or fictional).', superclasses={rdflib.term.URIRef('https://schema.org/Thing')}, properties={rdflib.term.URIRef('https://schema.org/owns'), rdflib.term.URIRef('https://schema.org/skills'), rdflib.term.URIRef('https://schema.org/contactPoint'), rdflib.term.URIRef('https://schema.org/award'), rdflib.term.URIRef('https://schema.org/memberOf'), rdflib.term.URIRef('https://schema.org/jobTitle'), rdflib.term.URIRef('https://schema.org/spouse'), rdflib.term.URIRef('https://schema.org/seeks'), rdflib.term.URIRef('https://schema.org/birthPlace'), rdflib.term.URIRef('https://schema.org/address'), rdflib.term.URIRef('https://schema.org/naics'), rdflib.term.URIRef('https://schema.org/netWorth'), rdflib.term.URIRef('https://schema.org/vatID'), rdflib.term.URIRef('https://schema.org/makesOffer'), rdflib.term.URIRef('https://schema.org/additionalNam

# Step 3: 

In [322]:
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, XSD
from typing import List, Set, Dict, Optional, NamedTuple, Union, ForwardRef, Any
from pydantic import BaseModel, Field, EmailStr, AnyUrl
from datetime import date, datetime, time
import re
import keyword
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, XSD
from typing import List, Set, Dict, Optional, NamedTuple, Union, ForwardRef, Any, cast
# Added imports for richer types
from pydantic import BaseModel, Field, EmailStr, AnyUrl, constr, conint, condecimal, field_validator, model_validator
from datetime import date, datetime, time, timedelta
import isodate # Library to parse ISO 8601 durations
import decimal
import keyword
import logging
import re
import textwrap

In [323]:
class Quantity(BaseModel):
    """
    Base model for quantitative values based on schema.org/Quantity.
    Actual value and unit are often in subclasses or specific properties.
    This primarily serves as a conceptual base.
    """
    model_config = {'extra': 'allow'} # Allow extra fields as Quantity is generic

class Distance(Quantity):
    """
    Represents a distance based on schema.org/Distance.
    Uses value and unit representation common in QuantitativeValue.
    """
    # Based on properties commonly used with QuantitativeValue for distance
    value: Optional[float] = Field(None, description="The numerical value of the distance.")
    unitCode: Optional[str] = Field(None, description="UN/CEFACT Common Code (3 characters) or URL for the unit of measurement. E.g., 'MTR' for meter, 'KM' for kilometer, 'FT' for foot, 'INH' for inch.")
    unitText: Optional[str] = Field(None, description="A string indicating the unit of measurement. Useful if unitCode is not applicable or needs clarification. E.g., 'meters', 'miles'.")

    model_config = {'extra': 'forbid'}

    # Add validation if needed, e.g., check unitCode format

class Duration(Quantity):
    """
    Represents a duration based on schema.org/Duration.
    Stores duration as datetime.timedelta, parsed from ISO 8601 duration format.
    """
    # Pydantic doesn't have native ISO 8601 duration parsing, use validator
    value_iso8601: Optional[str] = Field(None, alias="iso8601Duration", description="Duration in ISO 8601 format (e.g., P1Y2M3DT4H5M6S).")
    value_timedelta: Optional[timedelta] = Field(None, exclude=True, description="Parsed timedelta value (internal).") # Exclude from standard model dump

    model_config = {'extra': 'forbid', 'populate_by_name': True} # Allow using alias on input

    @model_validator(mode='before')
    @classmethod
    def parse_duration(cls, data: Any) -> Any:
        if isinstance(data, dict):
            iso_duration_str = data.get("value_iso8601") or data.get("iso8601Duration")
            if iso_duration_str and isinstance(iso_duration_str, str):
                try:
                    # Use isodate library to parse ISO 8601 duration
                    td = isodate.parse_duration(iso_duration_str)
                    data['value_timedelta'] = td
                    # Keep original string too
                    data['value_iso8601'] = iso_duration_str
                except (isodate.ISO8601Error, ValueError) as e:
                    # Or raise validation error depending on strictness needed
                    logging.warning(f"Could not parse ISO 8601 duration '{iso_duration_str}': {e}")
                    data['value_timedelta'] = None # Set internal value to None on error
            # If timedelta is provided directly
            elif data.get('value_timedelta') and isinstance(data.get('value_timedelta'), timedelta):
                 # Optionally generate ISO string if needed, though complex
                 pass
        elif isinstance(data, str):
             # Allow direct initialization from ISO string
             try:
                 td = isodate.parse_duration(data)
                 return {'value_iso8601': data, 'value_timedelta': td}
             except (isodate.ISO8601Error, ValueError) as e:
                 logging.warning(f"Could not parse ISO 8601 duration string '{data}': {e}")
                 return {'value_iso8601': data, 'value_timedelta': None} # Return original string, mark as failed parse

        return data # Return dict for Pydantic processing

    def __str__(self) -> str:
        """Return ISO 8601 string representation."""
        if self.value_timedelta is not None:
             try:
                 # isodate can also format, but requires careful handling of years/months
                 # For simplicity, return original string if present, else standard timedelta str
                 return self.value_iso8601 or str(self.value_timedelta)
             except Exception:
                 return str(self.value_timedelta) # Fallback
        return self.value_iso8601 or "Invalid Duration"


class DefinedTerm(BaseModel):
    """
    Represents a term from a defined set based on schema.org/DefinedTerm.
    """
    # Core properties often associated with DefinedTerm
    termCode: Optional[str] = Field(None, description="A code that identifies this DefinedTerm within a DefinedTermSet.")
    name: Optional[str] = Field(None, description="The name of the item.")
    description: Optional[str] = Field(None, description="A description of the item.")
    # Allow referencing the set it belongs to, if known
    inDefinedTermSet: Optional[AnyUrl] = Field(None, description="A DefinedTermSet Organization or DataCatalog that contains this term.")

    model_config = {'extra': 'allow'} # Allow potential other properties from schema.org or extensions


In [324]:
# def safe_python_identifier(name: str) -> str:
#     """Converts a name to a valid Python identifier, handling keywords."""
#     if keyword.iskeyword(name):
#         return name + "_"
#     # Basic check for valid identifier start/characters - can be improved
#     if not name or not (name[0].isalpha() or name[0] == '_'):
#         name = '_' + name # Ensure valid start if needed
#     # Replace invalid characters (simplistic)
#     name = re.sub(r'\W|^(?=\d)', '_', name)
#     return name

# def map_uri_to_classname(uri: rdflib.URIRef) -> str:
#     """Converts a schema.org URI to a Python CamelCase class name."""
#     if not str(uri).startswith(str(SCHEMA)):
#         # Handle non-schema.org URIs if necessary, maybe return original or raise error
#         return str(uri).split('/')[-1].split('#')[-1] # Best guess
#     local_name = uri.replace(SCHEMA, "")
#     # Basic check for upper camel case, assuming schema.org mostly uses this
#     if local_name and local_name[0].isupper():
#         return safe_python_identifier(local_name)
#     else:
#         # Attempt to convert potentially lowerCamelCase or other cases
#         # This is a simple heuristic, might need refinement
#         parts = re.split(r'[-_ ]', local_name)
#         return safe_python_identifier("".join(part.capitalize() for part in parts))


# def map_uri_to_fieldname(uri: rdflib.URIRef) -> str:
#     """Converts a schema.org property URI to a Python snake_case field name."""
#     if not str(uri).startswith(str(SCHEMA)):
#          return safe_python_identifier(str(uri).split('/')[-1].split('#')[-1].lower()) # Best guess
#     local_name = uri.replace(SCHEMA, "")
#     # Convert camelCase or PascalCase to snake_case
#     s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', local_name)
#     snake_case_name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
#     return safe_python_identifier(snake_case_name)

# def map_range_to_typehint(
#     ranges: Set[rdflib.URIRef],
#     class_registry: Set[str], # Set of known generated class names
#     default_optional: bool = True,
#     use_list_for_multi: bool = True # Assume List for properties allowing multiple values in RDF? Risky default.
#                                      # Schema.org generally doesn't use OWL cardinality.
#                                      # A safer default might be Optional[Union[T1, T2, List[T1], List[T2]]]
#                                      # For v0.1, let's keep it simpler: use Optional[Union[...]]
# ) -> str:
#     """Maps a set of RDF range URIs to a Python type hint string."""
#     if not ranges:
#         return "typing.Any" # No range specified

#     mapped_types = set()
#     for r_uri in ranges:
#         if r_uri in TYPE_MAP:
#             mapped_types.add(TYPE_MAP[r_uri])
#         else:
#             # Assume it's a class defined in our ontology
#             class_name = map_uri_to_classname(r_uri)
#             if class_name in class_registry:
#                  # Use string literal for forward reference
#                 mapped_types.add(f"'{class_name}'")
#             elif str(r_uri) == str(SCHEMA.Thing) or str(r_uri) == str(RDFS.Resource):
#                  mapped_types.add("typing.Any") # Map generic Thing/Resource
#             else:
#                  # Unknown range URI - treat as Any or potentially raise error/warning
#                  logging.warning(f"Unknown range URI encountered: {r_uri}. Mapping to Any.")
#                  mapped_types.add("typing.Any")

#     # Remove duplicates and sort for consistent output
#     unique_types = sorted(list(mapped_types))

#     if not unique_types:
#          return "typing.Any" # Should not happen if ranges is not empty, but safety check

#     # Build the Union if multiple types
#     type_hint_core = ""
#     if len(unique_types) == 1:
#         type_hint_core = unique_types[0]
#     else:
#         type_hint_core = f"typing.Union[{', '.join(unique_types)}]"

#     # Handle Optionality (defaulting to Optional for v0.1 simplicity)
#     # A more advanced version would check OWL cardinality if present
#     if default_optional:
#         # Check if None is effectively already included via Optional[...] in the union parts
#         is_already_optional = any(t.startswith("typing.Optional[") or t == 'None' for t in unique_types)
#         if not is_already_optional:
#              return f"typing.Optional[{type_hint_core}]"
#         else:
#              # If Optional is already part of a Union, just return the Union
#              # e.g., Union[Optional['Thing'], str] is valid
#              # This logic might need refinement based on desired strictness
#              return type_hint_core
#     else:
#         return type_hint_core

# def get_field_metadata(prop_info: PropertyInfo) -> Dict[str, Union[str, Dict]]:
#     """Generates arguments for pydantic.Field based on PropertyInfo."""
#     args = {}
#     if prop_info.comment:
#         # Basic cleaning of comment string
#         clean_comment = ' '.join(prop_info.comment.split())
#         args['description'] = clean_comment
#     # Add aliases for common variations
#     if prop_info.uri in PROPERTY_ALIAS_MAP:
#         # Pydantic v2 alias handling might differ slightly, adjust as needed
#         for alias_type, alias_value in PROPERTY_ALIAS_MAP[prop_info.uri].items():
#              args[alias_type] = alias_value # e.g., validation_alias='dob'

#     # Example: Add examples if available (assuming they could be parsed from RDF)
#     # if prop_info.examples: args['examples'] = prop_info.examples

#     # Default value is None for Optional fields, handled by type hint + Field(None)
#     # Required fields would have no default in Field()
#     # We default to Optional, so default is usually None
#     default_value = None

#     field_args_str = f"default={default_value}"
#     if args:
#         args_repr = ', '.join(f"{k}={repr(v)}" for k, v in args.items())
#         field_args_str += f", {args_repr}"

#     # Return structure suitable for formatting into Field(...) call
#     # Returning dict for easier manipulation before final string formatting
#     return {'default': default_value, **args}


# # --- MRO Resolution Helper Functions ---

# def get_all_ancestors(graph: rdflib.Graph, class_uri: rdflib.URIRef, known_classes_uris: Set[rdflib.URIRef]) -> Set[rdflib.URIRef]:
#     """Recursively find all superclass URIs for a given class URI within our known set."""
#     ancestors = set()
#     parents = set(graph.objects(subject=class_uri, predicate=RDFS.subClassOf))
#     for parent_uri in parents:
#         if parent_uri in known_classes_uris and parent_uri not in [RDFS.Resource, OWL.Thing]:
#             if parent_uri not in ancestors: # Avoid infinite loops
#                 ancestors.add(parent_uri)
#                 ancestors.update(get_all_ancestors(graph, parent_uri, known_classes_uris))
#     return ancestors

# def get_base_classes(
#      class_info: ClassInfo,
#      all_class_names: Set[str], # Not strictly needed here, using URIs mainly
#      all_class_uris: Set[rdflib.URIRef],
#      graph: rdflib.Graph
#      ) -> List[str]:
#     """Determines the most specific base classes for a Pydantic model, pruning redundant ancestors."""
#     direct_superclass_uris = {sup for sup in class_info.superclasses if sup in all_class_uris and sup not in [RDFS.Resource, OWL.Thing]}
#     if not direct_superclass_uris: return ["BaseModel"]

#     ancestor_map = {sup: get_all_ancestors(graph, sup, all_class_uris) for sup in direct_superclass_uris}
#     minimal_bases_uris = set()
#     for potential_base in direct_superclass_uris:
#         is_ancestor_of_another = False
#         for other_base in direct_superclass_uris:
#             if potential_base != other_base and potential_base in ancestor_map.get(other_base, set()):
#                 is_ancestor_of_another = True; break
#         if not is_ancestor_of_another: minimal_bases_uris.add(potential_base)

#     if not minimal_bases_uris:
#         # If pruning removed everything, it might mean all direct parents were ancestors of others.
#         # Fallback cautiously to direct parents or just BaseModel
#         logger.warning(f"Could not determine minimal bases for {class_info.uri} from {direct_superclass_uris} after pruning. Check hierarchy. Defaulting to direct parents or BaseModel.")
#         # Use direct superclass names if pruning failed, otherwise BaseModel
#         direct_base_names = sorted([map_uri_to_classname(uri) for uri in direct_superclass_uris])
#         return direct_base_names if direct_base_names else ["BaseModel"]
#     else:
#         base_class_names = sorted([map_uri_to_classname(uri) for uri in minimal_bases_uris])
#         return base_class_names


# Step 4: Pydantic Class Code Generation

## Updated Step 4 plus some of 3:

In [325]:
def safe_python_identifier(name: str) -> str:
    if keyword.iskeyword(name): return name + "_"
    if not name or not (name[0].isalpha() or name[0] == '_'): name = '_' + name
    name = re.sub(r'\W|^(?=\d)', '_', name)
    return name

def map_uri_to_classname(uri: rdflib.URIRef) -> str:
    # Simplified version for brevity, assumes SCHEMA namespace primarily
    local_name = uri.replace(SCHEMA, "")
    if not local_name or ':' in local_name or '/' in local_name: # Handle non-schema or complex URIs crudely
        local_name = str(uri).split('/')[-1].split('#')[-1]
    # Convert to CamelCase if needed
    if local_name and local_name[0].islower():
        parts = re.split(r'[-_ ]', local_name)
        local_name = "".join(part.capitalize() for part in parts if part)
    return safe_python_identifier(local_name or "_UnknownClass")

def map_uri_to_fieldname(uri: rdflib.URIRef) -> str:
    local_name = uri.replace(SCHEMA, "")
    if not local_name or ':' in local_name or '/' in local_name:
        local_name = str(uri).split('/')[-1].split('#')[-1]
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', local_name)
    snake_case_name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
    return safe_python_identifier(snake_case_name or "_unknown_field")

def get_module_path_for_class(class_name: str) -> str:
    """Determines relative module path (snake_case) for a CamelCase class name."""
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', class_name)
    snake_case_name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
    module_name_part = safe_python_identifier(snake_case_name)
    return f".{module_name_part}"

def map_range_to_typehint(
    ranges: Set[rdflib.URIRef],
    all_class_names: Set[str],
    base_type_names: Set[str]
) -> str:
    """Maps RDF range URIs to a Python type hint string."""
    if not ranges: return "Any"
    mapped_types = set()
    for r_uri in ranges:
        if r_uri in TYPE_MAP:
            type_name = TYPE_MAP[r_uri]
            if type_name in base_type_names:
                 # Reference base types via the module where they will live
                 mapped_types.add(f"base_types.{type_name}")
            else:
                 mapped_types.add(type_name)
        else:
            class_name = map_uri_to_classname(r_uri)
            if class_name in all_class_names:
                 mapped_types.add(f"'{class_name}'")
            elif str(r_uri) == str(SCHEMA.Thing) or str(r_uri) == str(RDFS.Resource):
                 mapped_types.add("Any")
            else:
                 logger.warning(f"Unknown range URI: {r_uri}. Mapping to Any.")
                 mapped_types.add("Any")

    unique_types = sorted([t for t in mapped_types if t != "Any"])
    if not unique_types: return "Any"

    type_hint_core = ""
    if len(unique_types) == 1:
        type_hint_core = unique_types[0]
    else:
        union_args = []
        for t in unique_types:
             # Clean potential prefixes only for final display in Union, imports handle resolution
             # clean_t = t.replace("base_types.", "") if t.startswith("base_types.") else t
             union_args.append(t) # clean_t
        type_hint_core = f"Union[{', '.join(union_args)}]"
    print(f"type_hint_core: {type_hint_core}")
    # Default to Optional
    is_already_optional = any(t.startswith("Optional[") or t == 'None' for t in unique_types)
    if not is_already_optional:
         return f"Optional[{type_hint_core}]"
    else:
         return type_hint_core

def get_field_metadata(prop_info: PropertyInfo) -> Dict:
    """Generates arguments for pydantic.Field based on PropertyInfo."""
    args = {}
    if prop_info.comment:
        clean_comment = ' '.join(prop_info.comment.split())
        # Escape potential triple quotes in comments for docstrings/descriptions
        clean_comment = clean_comment.replace('"""', '\\"\\"\\"').replace("'''", "\\'\\'\\'")
        args['description'] = clean_comment
    if prop_info.uri in PROPERTY_ALIAS_MAP:
        for alias_type, alias_value in PROPERTY_ALIAS_MAP[prop_info.uri].items():
             args[alias_type] = alias_value
    return {'default': None, **args}

# --- MRO Resolution Helper Functions ---

def get_all_ancestors(graph: rdflib.Graph, class_uri: rdflib.URIRef, known_classes_uris: Set[rdflib.URIRef]) -> Set[rdflib.URIRef]:
    """Recursively find all superclass URIs for a given class URI within our known set."""
    ancestors = set()
    parents = set(graph.objects(subject=class_uri, predicate=RDFS.subClassOf))
    for parent_uri in parents:
        if parent_uri in known_classes_uris and parent_uri not in [RDFS.Resource, OWL.Thing]:
            if parent_uri not in ancestors: # Avoid infinite loops
                ancestors.add(parent_uri)
                ancestors.update(get_all_ancestors(graph, parent_uri, known_classes_uris))
    return ancestors

def get_base_classes(
     class_info: ClassInfo,
     all_class_names: Set[str], # Not strictly needed here, using URIs mainly
     all_class_uris: Set[rdflib.URIRef],
     graph: rdflib.Graph
     ) -> List[str]:
    """Determines the most specific base classes for a Pydantic model, pruning redundant ancestors."""
    direct_superclass_uris = {sup for sup in class_info.superclasses if sup in all_class_uris and sup not in [RDFS.Resource, OWL.Thing]}
    if not direct_superclass_uris: return ["BaseModel"]

    ancestor_map = {sup: get_all_ancestors(graph, sup, all_class_uris) for sup in direct_superclass_uris}
    minimal_bases_uris = set()
    for potential_base in direct_superclass_uris:
        is_ancestor_of_another = False
        for other_base in direct_superclass_uris:
            if potential_base != other_base and potential_base in ancestor_map.get(other_base, set()):
                is_ancestor_of_another = True; break
        if not is_ancestor_of_another: minimal_bases_uris.add(potential_base)

    if not minimal_bases_uris:
        # If pruning removed everything, it might mean all direct parents were ancestors of others.
        # Fallback cautiously to direct parents or just BaseModel
        logger.warning(f"Could not determine minimal bases for {class_info.uri} from {direct_superclass_uris} after pruning. Check hierarchy. Defaulting to direct parents or BaseModel.")
        # Use direct superclass names if pruning failed, otherwise BaseModel
        direct_base_names = sorted([map_uri_to_classname(uri) for uri in direct_superclass_uris])
        return direct_base_names if direct_base_names else ["BaseModel"]
    else:
        base_class_names = sorted([map_uri_to_classname(uri) for uri in minimal_bases_uris])
        return base_class_names


In [326]:
from __future__ import annotations
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, XSD
from typing import List, Set, Dict, Optional, NamedTuple, Union, Any, cast, ForwardRef, TYPE_CHECKING
from pydantic import (
    BaseModel, Field, AnyUrl, field_validator,
    model_validator, condecimal, constr, EmailStr
)
from datetime import date, datetime, time, timedelta
import decimal
import isodate # Requires: pip install isodate
import keyword
import logging
import os
import pathlib
import re
import shlex
import textwrap
import importlib
import pkgutil


In [327]:
# --- Complete Step 4 Script: Schema.org to Pydantic Generator ---


# --- Basic Configuration ---
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(name)s:%(message)s')
logger = logging.getLogger(__name__)

SCHEMA_FILE = "schema.txt" # Assumes schema.txt is in the same directory
SCHEMA_FORMAT = "turtle"
OUTPUT_DIR = "output_ontology"
MODELS_SUBDIR = "models"
BASE_TYPES_FILENAME = "base_types.py" # Filename for base types

SCHEMA = rdflib.Namespace("https://schema.org/")


In [332]:
# --- Code for base_types.py (Essential Context) ---
BASE_TYPES_CODE = """
from __future__ import annotations # Keep first
from pydantic import (
    BaseModel, Field, AnyUrl, field_validator,
    model_validator, condecimal, constr, EmailStr
)
from typing import Optional, List, Union, Any, Annotated
from datetime import date, datetime, time, timedelta
import decimal
import isodate # Requires: pip install isodate
import logging

logger = logging.getLogger(__name__) # Use logger for warnings

class Quantity(BaseModel):
    \"\"\"Base model for schema.org/Quantity.\"""\"\"
    model_config = {'extra': 'allow'}

class Distance(Quantity):
    \"\"\"Represents schema.org/Distance.\"""\"\"
    value: Optional[float] = Field(None, description="The numerical value of the distance.")
    unitCode: Optional[str] = Field(None, description="UN/CEFACT Common Code (3 characters) or URL. E.g., 'MTR', 'KM', 'FT'.")
    unitText: Optional[str] = Field(None, description="A string indicating the unit of measurement.")
    model_config = {'extra': 'forbid'}

class Duration(Quantity):
    \"\"\"Represents schema.org/Duration, parsing ISO 8601 string.\"""\"\"
    value_iso8601: Optional[str] = Field(None, validation_alias='iso8601Duration', serialization_alias='iso8601Duration', description="Duration in ISO 8601 format.")
    value_timedelta: Optional[timedelta] = Field(None, exclude=True, description="Parsed timedelta value (internal).")
    model_config = {'extra': 'forbid', 'populate_by_name': True}

    @model_validator(mode='before')
    @classmethod
    def parse_duration(cls, data: Any) -> Any:
        if isinstance(data, dict):
            iso_duration_str = data.get("value_iso8601") or data.get("iso8601Duration")
            if iso_duration_str and isinstance(iso_duration_str, str) and 'value_timedelta' not in data:
                try:
                    td = isodate.parse_duration(iso_duration_str)
                    data['value_timedelta'] = td
                    data['value_iso8601'] = iso_duration_str
                except (isodate.ISO8601Error, ValueError) as e:
                    logger.warning(f"Could not parse ISO 8601 duration '{iso_duration_str}': {e}")
                    data['value_timedelta'] = None
                    data['value_iso8601'] = iso_duration_str
            elif data.get('value_timedelta') and isinstance(data.get('value_timedelta'), timedelta):
                pass
        elif isinstance(data, str):
             try: 
                 return {'value_iso8601': data, 'value_timedelta': isodate.parse_duration(data)}
             except (isodate.ISO8601Error, ValueError) as e:
                 logger.warning(f"Could not parse ISO 8601 duration string '{data}': {e}")
                 return {'value_iso8601': data, 'value_timedelta': None}
        return data

    @property
    def timedelta(self) -> Optional[timedelta]: return self.value_timedelta
    def __str__(self) -> str: return self.value_iso8601 or "Invalid/Missing Duration"

class DefinedTerm(BaseModel):
    \"\"\"Represents schema.org/DefinedTerm.\"""\"\"
    termCode: Optional[str] = Field(None, description="A code for this DefinedTerm within a DefinedTermSet.")
    name: Optional[str] = Field(None, description="The name of the item.")
    description: Optional[str] = Field(None, description="A description of the item.")
    inDefinedTermSet: Optional[AnyUrl] = Field(None, description="The DefinedTermSet that contains this term.")
    model_config = {'extra': 'allow'}

class Money(BaseModel):
     \"\"\"Represents an amount of money with a currency.\"""\"\"
     amount: Optional[decimal.Decimal] = Field(None, description="The amount of money.")
     currency: Optional[Annotated[str, constr(pattern=r'^[A-Z]{3}$')]] = Field(None, description="ISO 4217 Currency Code")

     @field_validator('amount', mode='before')
     @classmethod
     def clean_amount(cls, v: Any) -> Optional[decimal.Decimal]:
         if isinstance(v, (int, float)):
             try:
                 return decimal.Decimal(v)
             except Exception as e: 
                 logger.error(f"Error converting {v} to Decimal: {e}")
                 raise ValueError(f"Cannot convert {v} to Decimal")
         if isinstance(v, str):
             try: 
                 return decimal.Decimal(v.strip())
             except decimal.InvalidOperation:
                 raise ValueError(f"Invalid decimal format for amount: {v}")
         if isinstance(v, decimal.Decimal) or v is None: 
            return v   
         raise ValueError(f"Unexpected type for amount: {type(v)}")

     model_config = {'extra': 'forbid'}
"""


In [333]:
def generate_pydantic_model_code(
    class_info: ClassInfo,
    properties_info: Dict[rdflib.URIRef, PropertyInfo],
    all_class_names: Set[str],
    all_class_uris: Set[rdflib.URIRef],
    schema_graph: rdflib.Graph, # Added argument
    base_type_names: Set[str]  # Added argument
) -> str:
    """Generates the Python code string for a single Pydantic model, including robust imports and correct base classes."""

    class_name = map_uri_to_classname(class_info.uri)

    # --- Use the Corrected get_base_classes ---
    valid_base_names_list = get_base_classes(class_info, all_class_names, all_class_uris, schema_graph)

    # --- Import Handling Setup ---
    typing_imports_needed = set()
    pydantic_imports_needed = set()
    datetime_imports_needed = set()
    other_stdlib_imports = set()
    cross_module_imports = set() # For forward refs in hints (populated in field loop)
    runtime_base_class_imports = set() # For class def line imports
    rich_type_imports = set()
    needs_base_types_import = False

    # Determine base class string and populate runtime imports
    if not valid_base_names_list or valid_base_names_list == ["BaseModel"]:
        base_class_str = "BaseModel"
        pydantic_imports_needed.add("BaseModel")
    else:
        base_class_str = ", ".join(valid_base_names_list)
        for base_name in valid_base_names_list:
             if base_name != "BaseModel":
                  if base_name != class_name:
                       # Use helper to get module path, ensure it's relative '.'
                       module_path = get_module_path_for_class(base_name)
                       runtime_base_class_imports.add(f"from {module_path} import {base_name}")
                  else: logger.warning(f"Class {class_name} listed itself as base?")

    # --- Analyze Fields ---
    field_definitions = []
    field_added = False
    for prop_uri in sorted(list(class_info.properties)):
        if prop_uri not in properties_info: continue
        prop_info = properties_info[prop_uri]
        field_name = map_uri_to_fieldname(prop_info.uri)
        
        # **** DETECT Need for Rich Types based on RANGES ****
        prop_needs_rich_type = False
        for r_uri in prop_info.ranges:
            if r_uri in base_type_uris: # Check against the set of URIs defined in base_types
                 class_name_for_import = map_uri_to_classname(r_uri)
                 rich_type_imports.add(class_name_for_import)
                 prop_needs_rich_type = True

        # Clean prefixes based on imports that *will* be generated
        type_hint_str = map_range_to_typehint(prop_info.ranges, all_class_names, base_type_names)
        final_type_hint = type_hint_str
        if "Optional" in final_type_hint or "List" in final_type_hint or \
           "Union" in final_type_hint or "Any" in final_type_hint:
            final_type_hint = final_type_hint.replace("typing.", "")
            typing_imports_needed.update(re.findall(r'\b(Optional|List|Union|Any)\b', final_type_hint))
        if "date" in final_type_hint or "datetime" in final_type_hint or \
           "time" in final_type_hint or "timedelta" in final_type_hint:
            final_type_hint = final_type_hint.replace("datetime.", "")
            datetime_imports_needed.update(re.findall(r'\b(date|datetime|time|timedelta)\b', final_type_hint))
        if "AnyUrl" in final_type_hint or "EmailStr" in final_type_hint:
             final_type_hint = final_type_hint.replace("pydantic.", "")
             pydantic_imports_needed.update(re.findall(r'\b(AnyUrl|EmailStr)\b', final_type_hint))
        # Rich types require prefix currently from map_range_to_typehint
        # if "base_types.Quantity" in final_type_hint: rich_type_imports.add("Quantity")
        # if "base_types.Distance" in final_type_hint: rich_type_imports.add("Distance")
        # if "base_types.Duration" in final_type_hint: rich_type_imports.add("Duration")
        # if "base_types.DefinedTerm" in final_type_hint: rich_type_imports.add("DefinedTerm")
        # if "base_types.Money" in final_type_hint: rich_type_imports.add("Money")
        # Remove base_types prefix for the final hint string
        # final_type_hint = final_type_hint.replace("base_types.", "")

        if "Decimal" in final_type_hint: other_stdlib_imports.add("import decimal")
        # if "base_types." in final_type_hint:
        #     needs_base_types_import = True


        # Track cross-module imports needed only for hints
        potential_classes_in_hint = set(re.findall(r"'(\w+)'", final_type_hint))
        for potential_class in potential_classes_in_hint:
             if potential_class in all_class_names and \
                potential_class != class_name and \
                potential_class not in valid_base_names_list:
                 cross_module_imports.add(potential_class)

        field_args_dict = get_field_metadata(prop_info)
        field_args_parts = [repr(field_args_dict.pop('default', None))]
        field_args_parts.extend(f"{k}={repr(v)}" for k, v in field_args_dict.items())
        field_call = f"Field({', '.join(field_args_parts)})"

        field_definitions.append(f"    {field_name}: {final_type_hint} = {field_call}")
        field_added = True

    if field_added: pydantic_imports_needed.add("Field")

    # --- Assemble Code ---
    code_parts = ["from __future__ import annotations"]

    if datetime_imports_needed: code_parts.append(f"from datetime import {', '.join(sorted(list(datetime_imports_needed)))}")
    code_parts.extend(sorted(list(other_stdlib_imports)))

    needs_type_checking_block = bool(cross_module_imports)
    if needs_type_checking_block: typing_imports_needed.add("TYPE_CHECKING")
    if typing_imports_needed: code_parts.append(f"from typing import {', '.join(sorted(list(typing_imports_needed)))}")

    # Add BaseModel if needed, ensure it's not duplicated if already added
    if "BaseModel" not in pydantic_imports_needed and base_class_str == "BaseModel":
         pydantic_imports_needed.add("BaseModel")
    if pydantic_imports_needed: code_parts.append(f"from pydantic import {', '.join(sorted(list(pydantic_imports_needed)))}")
    if needs_base_types_import:
        code_parts.append(f"from . import base_types")
    # if rich_type_imports: 
    code_parts.append(f"from . import base_types")
        # code_parts.append(f"from .base_types import {', '.join(sorted(list(rich_type_imports)))}")

    # Runtime base class imports MUST come before the TYPE_CHECKING block if they are needed by it
    code_parts.extend(sorted(list(runtime_base_class_imports)))

    if needs_type_checking_block:
        code_parts.append("\nif TYPE_CHECKING:")
        for class_to_import in sorted(list(cross_module_imports)):
            module_path = get_module_path_for_class(class_to_import)
            if class_to_import != class_name: code_parts.append(f"    from {module_path} import {class_to_import}")

    code_parts.append("\n")
    class_docstring = f'"""\n    {class_name}: {textwrap.shorten(class_info.comment or "No description provided.", width=70)}\n\n    Generated from: {class_info.uri}\n    """'
    code_parts.append(f"class {class_name}({base_class_str}):")
    code_parts.append(f"    {class_docstring}")
    if not field_definitions: code_parts.append("    pass")
    else: code_parts.extend(field_definitions)
    code_parts.append("\n    model_config = {'extra': 'forbid'}")

    return "\n".join(code_parts) + "\n"



In [334]:

# --- Corrected Ontology Models Generator Function ---
def generate_ontology_models(
     analyzed_schema: Dict[str, Dict],
     output_base_dir: str,
     
     # Added arguments needed by generate_pydantic_model_code
     schema_graph: rdflib.Graph,
     properties_info: Dict[rdflib.URIRef, PropertyInfo],
     models_subdir: str = "models",
     ) -> None:
    """
    Generates Pydantic model files from analyzed schema info, including base_types.py
    and a correctly structured __init__.py.
    """
    classes_info: Dict[rdflib.URIRef, ClassInfo] = analyzed_schema.get("classes", {})
    if not classes_info:
        logging.error("No class information found.")
        return

    output_path = pathlib.Path(output_base_dir) / models_subdir
    output_path.mkdir(parents=True, exist_ok=True)
    (output_path / "__init__.py").touch()

    base_type_names = {"Quantity", "Distance", "Duration", "DefinedTerm", "Money"}
    base_type_uris = { SCHEMA.Quantity, SCHEMA.Distance, SCHEMA.Duration, SCHEMA.DefinedTerm, SCHEMA.Money }
    # base_type_names = {map_uri_to_classname(uri) for uri in base_type_uris if uri in classes_info}
    print(f"base_type_uris: {base_type_uris}")
    print(f"base_type_names: {base_type_names}")

    base_types_path = output_path / BASE_TYPES_FILENAME
    try:
        # Ensure BASE_TYPES_CODE is accessible
        if 'BASE_TYPES_CODE' not in globals(): raise NameError("BASE_TYPES_CODE string not found.")
        with open(base_types_path, "w", encoding="utf-8") as f: f.write(BASE_TYPES_CODE)
        logging.info(f"Generated base types file: {base_types_path}")
    except Exception as e:
        logging.error(f"Failed to write {BASE_TYPES_FILENAME}: {e}", exc_info=True)
        return

    all_class_names = {map_uri_to_classname(uri) for uri in classes_info.keys()}
    all_class_names.update(base_type_names)
    all_class_uris = set(classes_info.keys()) # Needed for get_base_classes

    generation_order_uris = sorted(list(classes_info.keys()))
    logging.info(f"Attempting generation for {len(generation_order_uris)} classes...")

    generated_files = 0
    module_to_class_map: Dict[str, str] = {}

    for class_uri in generation_order_uris:
        if class_uri not in classes_info: continue
        if class_uri in base_type_uris: continue # Skip base types

        class_info = classes_info[class_uri]
        class_name = map_uri_to_classname(class_uri)
        module_name_part = map_uri_to_fieldname(class_uri)

        try:
            model_code = generate_pydantic_model_code(
                class_info=class_info,
                properties_info=properties_info,
                all_class_names=all_class_names,
                all_class_uris=all_class_uris,
                schema_graph=schema_graph,
                base_type_names=base_type_names
            )
            file_path = output_path / f"{module_name_part}.py"
            with open(file_path, "w", encoding="utf-8") as f: f.write(model_code)
            module_to_class_map[module_name_part] = class_name
            generated_files += 1
        except Exception as e:
            logging.error(f"Failed to generate code for class {class_name} ({class_uri}): {e}", exc_info=True)

    logging.info(f"Finished generating {generated_files} specific Pydantic model files in {output_path}")

    # --- Corrected __init__.py Generation ---
    init_py_path = output_path / "__init__.py"
    try:
        with open(init_py_path, "w", encoding="utf-8") as f:
            f.write("# flake8: noqa\n")
            f.write("# Auto-generated __init__.py\n\n")
            f.write("import logging\nimport importlib\nimport pkgutil\n")
            f.write("from typing import TYPE_CHECKING\nfrom pydantic import BaseModel\n\n")
            f.write("logger: logging.Logger = logging.getLogger(__name__)\n\n")

            if base_type_names:
                f.write("# --- Import Base Types ---\n")
                f.write("try:\n")
                f.write(f"    from .{BASE_TYPES_FILENAME[:-3]} import ({', '.join(sorted(list(base_type_names)))})\n")
                f.write("except ImportError as e_base:\n") # Use specific exception variable
                f.write("    logger.warning(f'Could not import base_types: {e_base}')\n\n")

            if module_to_class_map: f.write("# --- Import Generated Models ---\n")
            generated_class_names = set()
            for module_name_part, class_name in sorted(module_to_class_map.items()):
                 if class_name not in base_type_names:
                     f.write(f"try:\n")
                     f.write(f"    from .{module_name_part} import {class_name}\n")
                     generated_class_names.add(class_name)
                     f.write(f"except ImportError as e_mod:\n") # Use specific exception variable
                     f.write(f"    logger.warning(f'Could not import {class_name} from .{module_name_part}: {{e_mod}}')\n")

            all_names = sorted(list(base_type_names | generated_class_names))
            f.write("\n__all__ = [\n")
            for name in all_names: f.write(f'    "{name}",\n')
            f.write("]\n\n")

            f.write("# --- Rebuild models to resolve forward references ---\n")
            f.write("def rebuild_all() -> None:\n")
            f.write("    package_name = __name__\n")
            f.write("    package = importlib.import_module(package_name)\n")
            f.write("    rebuilt_models = set()\n")
            f.write("    if not hasattr(package, '__path__'): return\n") # Ensure it's a package
            f.write('    logger.debug(f"Attempting model rebuild in {package_name}")\n\n')
            f.write("    for loader, module_name, is_pkg in pkgutil.iter_modules(package.__path__, package_name + '.') :\n") # Use correct iter_modules signature
            f.write("        if is_pkg: continue # Don't try to rebuild packages like 'tests'\n")
            f.write("        # Skip __init__ itself and base_types\n")
            f.write("        if module_name.endswith('.__init__') or module_name.endswith('.base_types'):\n")
            f.write("            continue\n")
            f.write("        try:\n")
            f.write("            module = importlib.import_module(module_name)\n")
            f.write("            for attribute_name in dir(module):\n")
            f.write("                try: # Add inner try/except for attribute access/check\n")
            f.write("                    attribute = getattr(module, attribute_name)\n")
            f.write("                    if (isinstance(attribute, type) and\n")
            f.write("                            issubclass(attribute, BaseModel) and\n")
            f.write("                            attribute is not BaseModel and\n")
            f.write("                            hasattr(attribute, 'model_rebuild') and\n")
            f.write("                            attribute not in rebuilt_models):\n")
            f.write("                        try:\n")
            f.write('                            logger.debug(f"Rebuilding: {attribute.__name__} in {module_name}")\n')
            f.write("                            attribute.model_rebuild(force=True)\n")
            f.write("                            rebuilt_models.add(attribute)\n")
            f.write("                        except Exception as e_rebuild:\n")
            f.write("                            logger.error(f'Error rebuilding model {attribute.__name__} in {module_name}: {e_rebuild}', exc_info=False)\n")
            f.write("                except Exception as e_getattr: # Catch errors during getattr/issubclass\n")
            f.write("                     # logger.debug(f'Skipping attribute {attribute_name} in {module_name}: {e_getattr}')\n")
            f.write("                     pass # Ignore attributes that cause errors during introspection\n")
            f.write("        except ModuleNotFoundError:\n")
            f.write("            logger.warning(f\"Module not found during rebuild: {module_name}\")\n")
            f.write("        except Exception as e_import:\n")
            f.write("             logger.error(f'Error processing module {module_name} during rebuild: {e_import}', exc_info=False)\n\n")

            f.write("# Run rebuild automatically on import\n")
            f.write("try:\n")
            f.write("    rebuild_all()\n")
            f.write("    logger.info(f'Pydantic models in {__name__} package rebuilt.')\n")
            f.write("except Exception as e_global:\n")
            f.write("    logger.error(f'Global error during model rebuild: {e_global}', exc_info=True)\n")

        logging.info(f"Successfully generated __init__.py at {init_py_path}")
    except Exception as e:
        logging.error(f"Failed to write __init__.py: {e}", exc_info=True)



In [335]:
# --- Main Execution Block ---
if __name__ == "__main__":
    # Assume parse_schema_to_graph and analyze_schema_graph are defined correctly above
    # Also assume mapping helpers are defined (map_uri_to_classname etc.)

    logger.info("Starting Schema.org to Pydantic Conversion Process...")

    schema_graph_main = parse_schema_to_graph(SCHEMA_FILE, SCHEMA_FORMAT)
    if not schema_graph_main:
        logger.critical("Failed to parse schema graph. Exiting.")
        exit(1)

    analyzed_schema_main = analyze_schema_graph(schema_graph_main)
    if not analyzed_schema_main or not analyzed_schema_main.get("classes"):
        logger.critical("Schema analysis failed or yielded no classes. Exiting.")
        exit(1)

    properties_info_main: Dict[rdflib.URIRef, PropertyInfo] = analyzed_schema_main.get("properties", {})
    if not properties_info_main:
        logger.warning("No property information found during analysis.")
        # Continue, models will just lack fields
    # print(f"Analyzed Schema Main: {analyzed_schema_main}")

    try:
         generate_ontology_models(
             analyzed_schema=analyzed_schema_main,
             output_base_dir=OUTPUT_DIR,
             models_subdir=MODELS_SUBDIR,
             # Pass the required arguments
             schema_graph=schema_graph_main,
             properties_info=properties_info_main
         )
         logger.info("Ontology generation process finished.")
         logger.info(f"Generated models located in: {pathlib.Path(OUTPUT_DIR) / MODELS_SUBDIR}")
         logger.info("Proceed with Step 5: Post-Processing & Verification.")
    except Exception as e:
         logger.critical(f"Ontology generation failed: {e}", exc_info=True)
         exit(1)

    # Step 5 (Post-Processing) would typically be run after this script finishes
    # e.g., by calling run_post_processing_pipeline() if it's defined here,
    # or by running the shell script / commands separately.

INFO:__main__:Starting Schema.org to Pydantic Conversion Process...
INFO:root:Attempting to parse schema file: schema.txt (format: turtle)
INFO:root:Successfully parsed 8904 triples.
INFO:root:Found 628 potential schema.org classes.
INFO:root:Found 921 potential schema.org properties.
INFO:root:Analyzed 628 classes and 921 properties.
INFO:root:Generated base types file: output_ontology/models/base_types.py
INFO:root:Attempting generation for 628 classes...
INFO:root:Finished generating 625 specific Pydantic model files in output_ontology/models
INFO:root:Successfully generated __init__.py at output_ontology/models/__init__.py
INFO:__main__:Ontology generation process finished.
INFO:__main__:Generated models located in: output_ontology/models
INFO:__main__:Proceed with Step 5: Post-Processing & Verification.


base_type_uris: {rdflib.term.URIRef('https://schema.org/Distance'), rdflib.term.URIRef('https://schema.org/Duration'), rdflib.term.URIRef('https://schema.org/DefinedTerm'), rdflib.term.URIRef('https://schema.org/Money'), rdflib.term.URIRef('https://schema.org/Quantity')}
base_type_names: {'Duration', 'Money', 'Quantity', 'Distance', 'DefinedTerm'}
type_hint_core: str
type_hint_core: str
type_hint_core: str
type_hint_core: str
type_hint_core: str
type_hint_core: 'LocationFeatureSpecification'
type_hint_core: Union['BedDetails', 'BedType', str]
type_hint_core: 'QuantitativeValue'
type_hint_core: Union['QuantitativeValue', float]
type_hint_core: 'QuantitativeValue'
type_hint_core: str
type_hint_core: Union[bool, str]
type_hint_core: 'HowTo'
type_hint_core: 'ActionStatusType'
type_hint_core: Union['Organization', 'Person']
type_hint_core: Union[datetime.datetime, datetime.time]
type_hint_core: 'Thing'
type_hint_core: 'Thing'
type_hint_core: Union['Place', 'PostalAddress', str]
type_hint_co

In [213]:
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, XSD
from typing import List, Set, Dict, Optional, NamedTuple, Union, ForwardRef, Any, cast, TYPE_CHECKING # Added TYPE_CHECKING
from pydantic import BaseModel, Field, EmailStr, AnyUrl
from datetime import date, datetime, time, timedelta
import isodate
import decimal
import keyword
import logging
import os
import pathlib
from collections import defaultdict
import textwrap

# --- Assume functions and NamedTuples from Chunks 1, 2, 3 are available ---
# parse_schema_to_graph, analyze_schema_graph
# PropertyInfo, ClassInfo
# TYPE_MAP, PROPERTY_ALIAS_MAP
# safe_python_identifier, map_uri_to_classname, map_uri_to_fieldname,
# map_range_to_typehint, get_field_metadata, get_base_classes

# --- Configuration ---
OUTPUT_DIR = "output_ontology"
MODELS_SUBDIR = "models"
BASE_ONTOLOGY_MODULE = "core_ontology_v0_1" # Used for potential future imports if split

# --- Helper for Code Generation ---

# --- Helper to get import paths (adjust module structure if needed) ---
def get_module_path_for_class(class_name: str) -> str:
    """Determines the expected module name for a given class name."""
    # Assumes snake_case filename based on class name
    # This might need adjustment if filename generation logic changes
    potential_field_name = map_uri_to_fieldname(SCHEMA[class_name]) # Hacky way to get snake_case
    return f".{safe_python_identifier(potential_field_name)}" # Relative import

# --- Revised Code Generation Function ---

# def generate_pydantic_model_code(
#     class_info: ClassInfo,
#     properties_info: Dict[rdflib.URIRef, PropertyInfo],
#     all_class_names: Set[str] # All class names being generated
# ) -> str:
#     """Generates the Python code string for a single Pydantic model, including imports."""

#     class_name = map_uri_to_classname(class_info.uri)
#     base_uris = class_info.superclasses
#     # Map base URIs to class names, filtering for known/generated classes
#     potential_base_names = {map_uri_to_classname(uri) for uri in base_uris}
#     valid_base_names = sorted([name for name in potential_base_names if name in all_class_names and name != 'Thing'])

#     if not valid_base_names:
#          base_class_str = "BaseModel" # Inherit directly from BaseModel
#          imports = {"from pydantic import BaseModel, Field"}
#     else:
#          base_class_str = ", ".join(valid_base_names)
#          imports = {"from pydantic import BaseModel, Field"}
#          # Add imports for base classes (might need TYPE_CHECKING block)
#          for base_name in valid_base_names:
#              imports.add(f"from {get_module_path_for_class(base_name)} import {base_name}")


#     # Standard library imports potentially needed by type hints
#     std_imports = set()
#     typing_imports = {"Optional", "List", "Union", "Any", "TYPE_CHECKING"} # Always import TYPE_CHECKING now

#     field_definitions = []
#     field_type_imports = set() # Track imports needed for field types

#     sorted_property_uris = sorted(list(class_info.properties))

#     for prop_uri in sorted_property_uris:
#         # ... (rest of property analysis and field generation logic is largely the same as before) ...
#         if prop_uri not in properties_info:
#             logging.warning(f"Property {prop_uri} used by {class_name} not found. Skipping.")
#             continue

#         prop_info = properties_info[prop_uri]
#         field_name = map_uri_to_fieldname(prop_info.uri)
#         type_hint_str = map_range_to_typehint(prop_info.ranges, all_class_names)

#         # Track imports needed based on type hint
#         if "datetime." in type_hint_str: std_imports.add("import datetime")
#         if "decimal." in type_hint_str: std_imports.add("import decimal")
#         if "timedelta" in type_hint_str: std_imports.add("from datetime import timedelta") # Specifically timedelta
#         if "pydantic." in type_hint_str:
#             if "AnyUrl" in type_hint_str: imports.add("from pydantic import AnyUrl")
#             if "EmailStr" in type_hint_str: imports.add("from pydantic import EmailStr")
#             # Potentially add others like ConstrainedStr if defined via pydantic
#         for t in typing_imports:
#              # Check if typing constructs are actually used in the final hint string
#              # Basic check, might need refinement
#              if t in type_hint_str and t != 'TYPE_CHECKING': # Don't add TYPE_CHECKING to main import list
#                  std_imports.add(f"from typing import {t}")

#         # Track potential forward references / cross-module imports needed for type hints
#         # Extract potential class names from the type hint string (simple regex approach)
#         # A more robust way involves analyzing the mapped types *before* creating the string
#         potential_classes_in_hint = set(re.findall(r"'(\w+)'", type_hint_str))
#         for potential_class in potential_classes_in_hint:
#              if potential_class in all_class_names and potential_class != class_name:
#                  field_type_imports.add(f"from {get_module_path_for_class(potential_class)} import {potential_class}")


#         field_args_dict = get_field_metadata(prop_info)
#         field_args_parts = []
#         default_val_repr = repr(field_args_dict.pop('default', None))
#         field_args_parts.append(default_val_repr)
#         field_args_parts.extend(f"{k}={repr(v)}" for k, v in field_args_dict.items())
#         field_call = f"Field({', '.join(field_args_parts)})"

#         field_definitions.append(f"    {field_name}: {type_hint_str} = {field_call}")

#     # --- Assemble the full class code ---
#     code_parts = []

#     # Add __future__ import first
#     code_parts.append("from __future__ import annotations")

#     # Add standard library imports
#     code_parts.extend(sorted(list(std_imports)))

#     # Add base pydantic and potentially other direct imports
#     code_parts.extend(sorted([imp for imp in imports if not imp.startswith("from .")]))

#     # Add forward reference imports within TYPE_CHECKING block
#     if field_type_imports or any(imp.startswith("from .") for imp in imports):
#         code_parts.append("\nif TYPE_CHECKING:")
#         # Add base class imports if they are from other modules
#         for imp in sorted(list(imports)):
#              if imp.startswith("from ."):
#                  code_parts.append(f"    {imp}")
#         # Add field type imports
#         for imp in sorted(list(field_type_imports)):
#              code_parts.append(f"    {imp}")
#         code_parts.append("\n")


#     # Add class definition
#     class_docstring = f'"""\n    {class_name}: {textwrap.shorten(class_info.comment or "No description provided.", width=70)}\n\n    Generated from: {class_info.uri}\n    """'
#     # Ensure base classes needed at runtime are imported directly if possible
#     # This logic might need refinement - base classes might need direct import
#     # outside TYPE_CHECKING if Python's MRO needs them immediately.
#     # For now, assume TYPE_CHECKING handles most cases for hints + model_rebuild.
#     runtime_base_imports = []
#     actual_base_classes_str = base_class_str
#     # A simple attempt: if inheriting directly from BaseModel no runtime import needed here
#     # If inheriting from others, assume they are imported via __init__.py / TYPE_CHECKING + model_rebuild
#     # This part is tricky without full MRO analysis during generation.


#     code_parts.append(f"class {class_name}({actual_base_classes_str}):")
#     code_parts.append(f"    {class_docstring}")

#     if not field_definitions:
#         code_parts.append("    pass")
#     else:
#         code_parts.extend(field_definitions)

#     code_parts.append("\n    model_config = {'extra': 'forbid'}")

#     return "\n".join(code_parts) + "\n"


# def generate_ontology_models(
#      analyzed_schema: Dict[str, Dict],
#      output_base_dir: str,
#      models_subdir: str = "models"
#      ):
#     """Generates Pydantic model files from analyzed schema info."""

#     classes_info: Dict[rdflib.URIRef, ClassInfo] = analyzed_schema.get("classes", {})
#     properties_info: Dict[rdflib.URIRef, PropertyInfo] = analyzed_schema.get("properties", {})

#     if not classes_info or not properties_info:
#         logging.error("No class or property information found in analyzed schema.")
#         return

#     # Create output directory
#     output_path = pathlib.Path(output_base_dir) / models_subdir
#     output_path.mkdir(parents=True, exist_ok=True)
#     # Create __init__.py file
#     (output_path / "__init__.py").touch()

#     # Get all class names we intend to generate for forward ref handling
#     all_class_names = {map_uri_to_classname(uri) for uri in classes_info.keys()}

#     # Simple approach to handle inheritance order: generate multiple times or sort topologically.
#     # For simplicity here, we'll just iterate. Multiple passes might be needed in a robust tool
#     # or a proper topological sort based on the 'superclasses' links.
#     generated_files = 0
#     for class_uri, class_info in classes_info.items():
#         class_name = map_uri_to_classname(class_uri)
#         try:
#             model_code = generate_pydantic_model_code(class_info, properties_info, all_class_names)

#             # Write to file
#             file_path = output_path / f"{map_uri_to_fieldname(class_uri)}.py" # Use snake_case for filenames
#             with open(file_path, "w", encoding="utf-8") as f:
#                 f.write(model_code)
#             generated_files += 1
#         except Exception as e:
#             logging.error(f"Failed to generate code for class {class_name} ({class_uri}): {e}")

#     logging.info(f"Generated {generated_files} Pydantic model files in {output_path}")

#     # Add model_rebuild calls to __init__.py for resolving ForwardRefs
#     # This is crucial if models have circular dependencies
#     init_py_path = output_path / "__init__.py"
#     with open(init_py_path, "a", encoding="utf-8") as f:
#         f.write("\n# --- Rebuild models to resolve forward references ---\n")
#         f.write("import importlib\n")
#         f.write("import pkgutil\n")
#         f.write("import logging\n\n")
#         f.write("logger = logging.getLogger(__name__)\n\n")
#         f.write("def rebuild_all():\n")
#         f.write("    package_name = __name__\n")
#         f.write("    package = importlib.import_module(package_name)\n")
#         f.write("    for _, module_name, _ in pkgutil.iter_modules(package.__path__, package_name + '.') :\n")
#         f.write("        try:\n")
#         f.write("            module = importlib.import_module(module_name)\n")
#         f.write("            for attribute_name in dir(module):\n")
#         f.write("                attribute = getattr(module, attribute_name)\n")
#         f.write("                if isinstance(attribute, type) and issubclass(attribute, BaseModel) and attribute is not BaseModel:\n")
#         f.write("                    try:\n")
#         f.write("                        attribute.model_rebuild(force=True)\n")
#         f.write("                        # logger.debug(f'Rebuilt model {attribute.__name__}')\n")
#         f.write("                    except Exception as e_rebuild:\n")
#         f.write("                        logger.error(f'Error rebuilding model {attribute.__name__} in {module_name}: {e_rebuild}', exc_info=True)\n")
#         f.write("        except Exception as e_import:\n")
#         f.write("             logger.error(f'Error importing module {module_name}: {e_import}', exc_info=True)\n")
#         f.write("\n")
#         f.write("try:\n")
#         f.write("    from pydantic import BaseModel\n")
#         f.write("    rebuild_all()\n")
#         f.write("    logger.info(f'Pydantic models in {__name__} rebuilt successfully.')\n")
#         f.write("except ImportError:\n")
#         f.write("    logger.warning('Pydantic not installed, skipping model rebuild.')\n")
#         f.write("except Exception as e_global:\n")
#         f.write("    logger.error(f'Global error during model rebuild: {e_global}', exc_info=True)\n")





# Attempt V3: Not Working

In [214]:
# Ensure these imports are present at the top of your script
import rdflib
from rdflib.namespace import RDFS, OWL
from typing import List, Set
import logging

# Assume SCHEMA namespace and map_uri_to_classname helper are defined
# Assume ClassInfo NamedTuple is defined

logger = logging.getLogger(__name__)

def get_all_ancestors(graph: rdflib.Graph, class_uri: rdflib.URIRef, known_classes_uris: Set[rdflib.URIRef]) -> Set[rdflib.URIRef]:
    """Recursively find all superclass URIs for a given class URI within our known set."""
    ancestors = set()
    parents = set(graph.objects(subject=class_uri, predicate=RDFS.subClassOf))
    for parent_uri in parents:
        if parent_uri in known_classes_uris and parent_uri not in [RDFS.Resource, OWL.Thing]:
            if parent_uri not in ancestors: # Avoid infinite loops
                ancestors.add(parent_uri)
                ancestors.update(get_all_ancestors(graph, parent_uri, known_classes_uris))
    return ancestors

def get_base_classes(
     class_info: ClassInfo,
     all_class_names: Set[str], # Names are used for filtering potentially? Keep for now.
     all_class_uris: Set[rdflib.URIRef], # URIs of all generated classes
     graph: rdflib.Graph
     ) -> List[str]:
    """Determines the most specific base classes for a Pydantic model, pruning redundant ancestors."""
    # Find direct superclass URIs that are part of the set we are generating
    direct_superclass_uris = {sup for sup in class_info.superclasses if sup in all_class_uris and sup not in [RDFS.Resource, OWL.Thing]}

    if not direct_superclass_uris:
        return ["BaseModel"]

    # Find all known ancestors for each direct superclass
    ancestor_map = {sup: get_all_ancestors(graph, sup, all_class_uris) for sup in direct_superclass_uris}

    # Pruning logic: Keep base 'B' if it is not an ancestor of any *other* direct base 'A'.
    minimal_bases_uris = set()
    for potential_base in direct_superclass_uris:
        is_ancestor_of_another = False
        for other_base in direct_superclass_uris:
            if potential_base != other_base and potential_base in ancestor_map.get(other_base, set()):
                is_ancestor_of_another = True
                break
        if not is_ancestor_of_another:
            minimal_bases_uris.add(potential_base)

    if not minimal_bases_uris:
         logger.warning(f"Could not determine minimal bases for {class_info.uri} from {direct_superclass_uris}, defaulting to BaseModel.")
         return ["BaseModel"]
    else:
         # Map the minimal URIs to class names
         base_class_names = sorted([map_uri_to_classname(uri) for uri in minimal_bases_uris])
         # Ensure BaseModel is implicitly handled by Pydantic's MRO via the listed bases
         return base_class_names

In [215]:
# Not working: Assumes necessary imports (typing, pydantic, datetime, etc.) are handled correctly
# Assumes helper functions map_*, get_metadata, get_module_path are defined

def generate_pydantic_model_code(
    class_info: ClassInfo,
    properties_info: Dict[rdflib.URIRef, PropertyInfo],
    all_class_names: Set[str], # All generated class names (including base types)
    # *** ADDED ARGUMENTS ***
    all_class_uris: Set[rdflib.URIRef], # All generated class URIs
    schema_graph: rdflib.Graph,      # The parsed RDF graph
    base_type_names: Set[str]       # Names defined in base_types.py
) -> str:
    """Generates the Python code string for a single Pydantic model, including robust imports and correct base classes."""

    class_name = map_uri_to_classname(class_info.uri)

    # --- *** MODIFICATION: Call get_base_classes *** ---
    valid_base_names_list = get_base_classes(class_info, all_class_names, all_class_uris, schema_graph)
    # --- *** END MODIFICATION *** ---

    # --- Import Handling Setup ---
    typing_imports_needed = set()
    pydantic_imports_needed = set()
    datetime_imports_needed = set()
    other_stdlib_imports = set()
    cross_module_imports = set() # For TYPE_CHECKING block (forward refs in hints)
    runtime_base_class_imports = set() # For class definition line imports
    rich_type_imports = set() # For base_types like Quantity

    # Determine base class string and imports needed for bases
    if not valid_base_names_list or valid_base_names_list == ["BaseModel"]:
        base_class_str = "BaseModel"
        pydantic_imports_needed.add("BaseModel")
    else:
        base_class_str = ", ".join(valid_base_names_list)
        # Add imports needed for the base classes themselves at runtime
        for base_name in valid_base_names_list:
             if base_name != "BaseModel": # Prevent importing BaseModel relatively
                  if base_name != class_name:
                       runtime_base_class_imports.add(f"from .{get_module_path_for_class(base_name)} import {base_name}")
                  else:
                       logger.warning(f"Class {class_name} listed itself as a base? Skipping runtime import.")

    # --- Analyze Fields (Populate field_definitions and track imports) ---
    # (This part remains the same as the previous working version that fixed F401 errors)
    field_definitions = []
    field_added = False

    sorted_property_uris = sorted(list(class_info.properties))

    for prop_uri in sorted_property_uris:
        if prop_uri not in properties_info: continue

        prop_info = properties_info[prop_uri]
        field_name = map_uri_to_fieldname(prop_info.uri)
        # Pass base_type_names to the mapping function now
        type_hint_str = map_range_to_typehint(prop_info.ranges, all_class_names, base_type_names)

        # Clean prefixes and track actual usage for imports
        final_type_hint = type_hint_str # Start with original
        # Basic prefix removal - assumes map_range_to_typehint handles base_types prefix correctly
        final_type_hint = final_type_hint.replace("typing.", "")
        final_type_hint = final_type_hint.replace("datetime.", "")
        final_type_hint = final_type_hint.replace("pydantic.", "")

        # Track imports based on final usage
        if "Optional" in final_type_hint: typing_imports_needed.add("Optional")
        if "List" in final_type_hint: typing_imports_needed.add("List")
        if "Union" in final_type_hint: typing_imports_needed.add("Union")
        if "Any" in final_type_hint: typing_imports_needed.add("Any")
        if "date" in final_type_hint: datetime_imports_needed.add("date")
        if "datetime" in final_type_hint: datetime_imports_needed.add("datetime")
        if "time" in final_type_hint: datetime_imports_needed.add("time")
        if "timedelta" in final_type_hint: datetime_imports_needed.add("timedelta")
        if "Decimal" in final_type_hint: other_stdlib_imports.add("import decimal")
        if "AnyUrl" in final_type_hint: pydantic_imports_needed.add("AnyUrl")
        if "EmailStr" in final_type_hint: pydantic_imports_needed.add("EmailStr")
        # Rich types - import from base_types
        if "Quantity" in final_type_hint: rich_type_imports.add("Quantity")
        if "Distance" in final_type_hint: rich_type_imports.add("Distance")
        if "Duration" in final_type_hint: rich_type_imports.add("Duration")
        if "DefinedTerm" in final_type_hint: rich_type_imports.add("DefinedTerm")
        if "Money" in final_type_hint: rich_type_imports.add("Money")

        # Track cross-module imports needed for forward reference hints ('ClassName')
        potential_classes_in_hint = set(re.findall(r"'(\w+)'", final_type_hint))
        for potential_class in potential_classes_in_hint:
             # Add if it's a known class, not current, and not already a direct (runtime) base
             if potential_class in all_class_names and potential_class != class_name and potential_class not in valid_base_names_list:
                 cross_module_imports.add(potential_class)

        # Generate Field(...) call
        field_args_dict = get_field_metadata(prop_info)
        field_args_parts = [repr(field_args_dict.pop('default', None))]
        field_args_parts.extend(f"{k}={repr(v)}" for k, v in field_args_dict.items())
        field_call = f"Field({', '.join(field_args_parts)})"

        field_definitions.append(f"    {field_name}: {final_type_hint} = {field_call}")
        field_added = True

    if field_added: pydantic_imports_needed.add("Field")

    # --- Assemble Code ---
    code_parts = ["from __future__ import annotations"]

    # Add standard library imports
    if datetime_imports_needed:
        code_parts.append(f"from datetime import {', '.join(sorted(list(datetime_imports_needed)))}")
    code_parts.extend(sorted(list(other_stdlib_imports)))

    # Add typing imports (conditionally add TYPE_CHECKING)
    typehint_only_imports = cross_module_imports # Bases are imported via runtime_base_class_imports
    needs_type_checking_block = bool(typehint_only_imports)
    if needs_type_checking_block:
        typing_imports_needed.add("TYPE_CHECKING")
    if typing_imports_needed:
        code_parts.append(f"from typing import {', '.join(sorted(list(typing_imports_needed)))}")

    # Add Pydantic imports
    if pydantic_imports_needed:
        code_parts.append(f"from pydantic import {', '.join(sorted(list(pydantic_imports_needed)))}")

    # Add Rich type imports from base_types
    if rich_type_imports:
         code_parts.append(f"from .base_types import {', '.join(sorted(list(rich_type_imports)))}")

    # Add Runtime base class imports (already formatted with 'from .module import Class')
    code_parts.extend(sorted(list(runtime_base_class_imports)))

    # Generate TYPE_CHECKING block ONLY if needed
    if needs_type_checking_block:
        code_parts.append("\nif TYPE_CHECKING:")
        for class_to_import in sorted(list(typehint_only_imports)):
            module_path = get_module_path_for_class(class_to_import)
            if class_to_import != class_name:
                 code_parts.append(f"    from {module_path} import {class_to_import}")

    # Class definition
    code_parts.append("\n")
    class_docstring = f'"""\n    {class_name}: {textwrap.shorten(class_info.comment or "No description provided.", width=70)}\n\n    Generated from: {class_info.uri}\n    """'
    # Use the calculated base_class_str which has pruned bases
    code_parts.append(f"class {class_name}({base_class_str}):")
    code_parts.append(f"    {class_docstring}")

    if not field_definitions:
        code_parts.append("    pass")
    else:
        code_parts.extend(field_definitions)

    code_parts.append("\n    model_config = {'extra': 'forbid'}")

    return "\n".join(code_parts) + "\n"

In [221]:
BASE_TYPES_FILENAME = "base_types.py"

In [222]:
# Assumes necessary imports and helper functions are defined above it

def generate_ontology_models(
     analyzed_schema: Dict[str, Dict],
     output_base_dir: str,
     # *** Added required arguments ***
     schema_graph: rdflib.Graph,
     properties_info: Dict[rdflib.URIRef, PropertyInfo], # Pass properties_info explicitly
     models_subdir: str = "models",
     ) -> None:
    """
    Generates Pydantic model files from analyzed schema info, including base_types.py
    and a correctly structured __init__.py.
    """
    classes_info: Dict[rdflib.URIRef, ClassInfo] = analyzed_schema.get("classes", {})
    # properties_info is now passed directly

    if not classes_info:
        logging.error("No class information found in analyzed schema. Cannot generate models.")
        return

    output_path = pathlib.Path(output_base_dir) / MODELS_SUBDIR # Use constant
    output_path.mkdir(parents=True, exist_ok=True)
    (output_path / "__init__.py").touch()

    base_type_uris = {
        SCHEMA.Quantity, SCHEMA.Distance, SCHEMA.Duration, SCHEMA.DefinedTerm, SCHEMA.Money
    }
    base_type_names = {map_uri_to_classname(uri) for uri in base_type_uris if uri in classes_info}

    # --- Generate base_types.py ---
    base_types_path = output_path / BASE_TYPES_FILENAME
    try:
        global BASE_TYPES_CODE # Ensure this is defined/accessible
        if 'BASE_TYPES_CODE' not in globals():
             raise NameError("BASE_TYPES_CODE string not found.")
        with open(base_types_path, "w", encoding="utf-8") as f:
            f.write(BASE_TYPES_CODE)
        logging.info(f"Generated base types file: {base_types_path}")
    except Exception as e:
        logging.error(f"Failed to write {BASE_TYPES_FILENAME}: {e}", exc_info=True)
        return

    all_class_names = {map_uri_to_classname(uri) for uri in classes_info.keys()}
    all_class_names.update(base_type_names)
    all_class_uris = set(classes_info.keys()) # Needed for get_base_classes

    generation_order_uris = sorted(list(classes_info.keys()))
    logging.info("Generating models in alphabetical order by URI.")


    # --- Generate individual model files ---
    generated_files = 0
    module_to_class_map: Dict[str, str] = {} # Store module_name_part -> ClassName

    for class_uri in generation_order_uris:
        if class_uri not in classes_info: continue
        if class_uri in base_type_uris: continue # Skip base types

        class_info = classes_info[class_uri]
        class_name = map_uri_to_classname(class_uri)
        module_name_part = map_uri_to_fieldname(class_uri)

        try:
            # **** CORRECTED CALL: Pass all required arguments ****
            model_code = generate_pydantic_model_code(
                class_info=class_info,
                properties_info=properties_info, # Pass explicitly
                all_class_names=all_class_names,
                all_class_uris=all_class_uris,   # Pass URIs
                schema_graph=schema_graph,       # Pass graph
                base_type_names=base_type_names  # Pass base names
            )
            # **** END CORRECTION ****

            file_path = output_path / f"{module_name_part}.py"
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(model_code)

            module_to_class_map[module_name_part] = class_name # Store mapping
            generated_files += 1

        except Exception as e:
            logging.error(f"Failed to generate code for class {class_name} ({class_uri}): {e}", exc_info=True)

    logging.info(f"Generated {generated_files} specific Pydantic model files in {output_path}")


    # --- Corrected __init__.py Generation (Using module_to_class_map) ---
    init_py_path = output_path / "__init__.py"
    try:
        with open(init_py_path, "w", encoding="utf-8") as f:
            f.write("# flake8: noqa\n")
            f.write("# Auto-generated __init__.py for ontology models\n\n")
            f.write("import logging\nimport importlib\nimport pkgutil\n")
            f.write("from typing import TYPE_CHECKING\nfrom pydantic import BaseModel\n\n")
            f.write("logger: logging.Logger = logging.getLogger(__name__)\n\n")

            # 1. Import from base_types
            if base_type_names:
                f.write("# --- Import Base Types ---\n")
                f.write("try:\n")
                f.write(f"    from .{BASE_TYPES_FILENAME[:-3]} import ({', '.join(sorted(list(base_type_names)))})\n") # Use filename w/o .py
                f.write("except ImportError:\n")
                f.write("    logger.warning('Could not import base_types')\n\n")

            # 2. Import from other generated modules using the map
            if module_to_class_map:
                 f.write("# --- Import Generated Models ---\n")
            generated_class_names = set()
            for module_name_part, class_name in sorted(module_to_class_map.items()):
                 if class_name not in base_type_names: # Should already be true due to skip logic
                     f.write(f"try:\n")
                     f.write(f"    from .{module_name_part} import {class_name}\n")
                     generated_class_names.add(class_name)
                     f.write(f"except ImportError:\n")
                     f.write(f"    logger.warning(f'Could not import {class_name} from .{module_name_part}')\n")

            # 3. Define __all__
            all_names = sorted(list(base_type_names | generated_class_names))
            f.write("\n__all__ = [\n")
            for name in all_names:
                f.write(f'    "{name}",\n')
            f.write("]\n\n")

            # 4. Include rebuild_all function and call
            #    (Ensure the full rebuild_all definition is included here as before)
            f.write("# --- Rebuild models to resolve forward references ---\n")
            f.write("def rebuild_all() -> None:\n")
            # <<<< INSERT FULL rebuild_all function definition here >>>>
            f.write("    pass # Placeholder - MUST INSERT FULL DEFINITION\n\n")
            f.write("# Run rebuild automatically on import\n")
            f.write("try:\n")
            f.write("    rebuild_all()\n")
            f.write("    logger.info(f'Pydantic models in {__name__} package rebuilt.')\n")
            f.write("except Exception as e_global:\n")
            f.write("    logger.error(f'Global error during model rebuild: {e_global}', exc_info=True)\n")

        logging.info(f"Successfully generated __init__.py at {init_py_path}")

    except Exception as e:
        logging.error(f"Failed to write __init__.py: {e}", exc_info=True)

In [223]:
if __name__ == "__main__":
    logger.info("Starting Schema.org to Pydantic Conversion Process...")

    # Step 1: Parse Schema
    schema_graph = parse_schema_to_graph(SCHEMA_FILE, SCHEMA_FORMAT)
    if not schema_graph:
        logger.critical("Failed to parse schema graph. Exiting.")
        exit(1)

    # Step 2: Analyze Schema
    analyzed_schema = analyze_schema_graph(schema_graph)
    if not analyzed_schema or not analyzed_schema.get("classes"):
        logger.critical("Schema analysis failed or yielded no classes. Exiting.")
        exit(1)

    # Extract properties_info - needed by generate_ontology_models
    properties_info_main: Dict[rdflib.URIRef, PropertyInfo] = analyzed_schema.get("properties", {})
    if not properties_info_main:
        logger.warning("No property information found during analysis. Models may lack fields.")
        # Decide if this is critical - for now, proceed

    # Step 3 & 4: Generate Models (calls the modified functions)
    try:
         generate_ontology_models(
             analyzed_schema=analyzed_schema,
             output_base_dir=OUTPUT_DIR,
             models_subdir=MODELS_SUBDIR,
             # Pass the necessary arguments explicitly
             schema_graph=schema_graph,
             properties_info=properties_info_main
         )
         logger.info("Ontology generation process finished.")
         logger.info(f"Generated models located in: {pathlib.Path(OUTPUT_DIR) / MODELS_SUBDIR}")
         logger.info("Proceed with Step 5: Post-Processing & Verification (running formatters, linters, mypy).")
    except Exception as e:
         logger.critical(f"Ontology generation failed: {e}", exc_info=True)
         exit(1)

    # Step 5 would be run externally using the script/commands from previous steps
    # e.g., run the 'run_post_processing_pipeline()' function if defined in this script

INFO:__main__:Starting Schema.org to Pydantic Conversion Process...
INFO:root:Attempting to parse schema file: schema.txt (format: turtle)
INFO:root:Successfully parsed 8904 triples.
INFO:root:Found 628 potential schema.org classes.
INFO:root:Found 921 potential schema.org properties.
INFO:root:Analyzed 628 classes and 921 properties.
INFO:root:Generated base types file: output_ontology/models/base_types.py
INFO:root:Generating models in alphabetical order by URI.
INFO:root:Generated 625 specific Pydantic model files in output_ontology/models
INFO:root:Successfully generated __init__.py at output_ontology/models/__init__.py
INFO:__main__:Ontology generation process finished.
INFO:__main__:Generated models located in: output_ontology/models
INFO:__main__:Proceed with Step 5: Post-Processing & Verification (running formatters, linters, mypy).


# Ignore below this


In [193]:
# Assumes other imports and helper functions (map_*, get_metadata, get_module_path) are defined elsewhere
# Partially working
def generate_pydantic_model_code(
    class_info: ClassInfo,
    properties_info: Dict[rdflib.URIRef, PropertyInfo],
    all_class_names: Set[str], # All generated class names
    all_class_uris: Set[rdflib.URIRef], # All generated class URIs
    schema_graph: rdflib.Graph # Pass the graph object here
) -> str:
    """Generates the Python code string for a single Pydantic model, including robust imports and correct base classes."""

    class_name = map_uri_to_classname(class_info.uri)

    # --- *** CORRECTED: Call get_base_classes to determine inheritance *** ---
    valid_base_names_list = get_base_classes(class_info, all_class_names, all_class_uris, schema_graph)
    # --- *** END CORRECTION *** ---

    # --- Import Handling Setup ---
    typing_imports_needed = set()
    pydantic_imports_needed = set()
    datetime_imports_needed = set()
    other_stdlib_imports = set()
    cross_module_imports = set() # For TYPE_CHECKING block (forward refs in hints)
    runtime_base_class_imports = set() # For class definition line
    rich_type_imports = set() # For base_types like Quantity

    # Determine base class string and imports needed for bases
    if not valid_base_names_list or valid_base_names_list == ["BaseModel"]:
        base_class_str = "BaseModel"
        pydantic_imports_needed.add("BaseModel") # Explicitly needed if direct base
    else:
        base_class_str = ", ".join(valid_base_names_list)
        # Add imports needed for the base classes themselves at runtime
        for base_name in valid_base_names_list:
             # Only add import if it's not BaseModel (which comes from pydantic)
             if base_name != "BaseModel":
                  # Ensure we don't try to import the class defining itself as a base
                  if base_name != class_name:
                       runtime_base_class_imports.add(f"from .{get_module_path_for_class(base_name)} import {base_name}")
                  else:
                       logger.warning(f"Class {class_name} listed itself as a base? Skipping runtime import.")


    # --- Analyze Fields (Populate field_definitions and track imports) ---
    # (This part remains the same as the previous working version that fixed F401 for TYPE_CHECKING)
    field_definitions = []
    field_added = False

    sorted_property_uris = sorted(list(class_info.properties))

    for prop_uri in sorted_property_uris:
        if prop_uri not in properties_info: continue

        prop_info = properties_info[prop_uri]
        field_name = map_uri_to_fieldname(prop_info.uri)
        # Pass all_class_names for forward ref detection during hint generation
        # Also pass base_type_names now (assuming defined globally or passed)
        global base_type_names # Or pass it in
        type_hint_str = map_range_to_typehint(prop_info.ranges, all_class_names, base_type_names)

        # Clean prefixes and track actual usage for imports
        final_type_hint = type_hint_str
        # (Logic for populating typing_imports_needed, pydantic_imports_needed, etc.)
        if "Optional" in final_type_hint: typing_imports_needed.add("Optional")
        if "List" in final_type_hint: typing_imports_needed.add("List")
        if "Union" in final_type_hint: typing_imports_needed.add("Union")
        if "Any" in final_type_hint: typing_imports_needed.add("Any")
        if "date" in final_type_hint: datetime_imports_needed.add("date")
        if "datetime" in final_type_hint: datetime_imports_needed.add("datetime")
        # ... etc for other types ...
        if "Quantity" in final_type_hint: rich_type_imports.add("Quantity")
        # ... etc for rich types ...
        if "AnyUrl" in final_type_hint: pydantic_imports_needed.add("AnyUrl")
        if "EmailStr" in final_type_hint: pydantic_imports_needed.add("EmailStr")


        # Track cross-module imports needed for forward reference hints ('ClassName')
        potential_classes_in_hint = set(re.findall(r"'(\w+)'", final_type_hint))
        for potential_class in potential_classes_in_hint:
             # Add if it's a known class, not the current one, and not already a direct base
             if potential_class in all_class_names and potential_class != class_name and potential_class not in valid_base_names_list:
                 cross_module_imports.add(potential_class)

        # Generate Field(...) call
        field_args_dict = get_field_metadata(prop_info)
        field_args_parts = [repr(field_args_dict.pop('default', None))]
        field_args_parts.extend(f"{k}={repr(v)}" for k, v in field_args_dict.items())
        field_call = f"Field({', '.join(field_args_parts)})"

        field_definitions.append(f"    {field_name}: {final_type_hint} = {field_call}")
        field_added = True

    if field_added: pydantic_imports_needed.add("Field") # Add Field if needed


    # --- Assemble Code (using the logic from the fix for F401 TYPE_CHECKING) ---
    code_parts = ["from __future__ import annotations"]

    # Add standard library imports
    if datetime_imports_needed:
        code_parts.append(f"from datetime import {', '.join(sorted(list(datetime_imports_needed)))}")
    code_parts.extend(sorted(list(other_stdlib_imports))) # Like 'import decimal'

    # Add typing imports (conditionally add TYPE_CHECKING)
    typehint_only_imports = cross_module_imports # Direct bases are imported via runtime_base_class_imports now
    needs_type_checking_block = bool(typehint_only_imports)
    if needs_type_checking_block:
        typing_imports_needed.add("TYPE_CHECKING")
    if typing_imports_needed:
        code_parts.append(f"from typing import {', '.join(sorted(list(typing_imports_needed)))}")

    # Add Pydantic imports
    if pydantic_imports_needed:
        code_parts.append(f"from pydantic import {', '.join(sorted(list(pydantic_imports_needed)))}")

    # Add Rich type imports from base_types
    if rich_type_imports:
         # Ensure correct relative path if base_types is in the same dir
         code_parts.append(f"from .base_types import {', '.join(sorted(list(rich_type_imports)))}")

    # Add Runtime base class imports (already formatted with 'from .module import Class')
    code_parts.extend(sorted(list(runtime_base_class_imports)))

    # Generate TYPE_CHECKING block ONLY if needed
    if needs_type_checking_block:
        code_parts.append("\nif TYPE_CHECKING:")
        for class_to_import in sorted(list(typehint_only_imports)):
            module_path = get_module_path_for_class(class_to_import)
            if class_to_import != class_name: # Avoid self-import in block
                 code_parts.append(f"    from {module_path} import {class_to_import}")

    # Class definition
    code_parts.append("\n")
    class_docstring = f'"""\n    {class_name}: {textwrap.shorten(class_info.comment or "No description provided.", width=70)}\n\n    Generated from: {class_info.uri}\n    """'
    # Use the calculated base_class_str which has pruned bases
    code_parts.append(f"class {class_name}({base_class_str}):")
    code_parts.append(f"    {class_docstring}")

    if not field_definitions:
        code_parts.append("    pass")
    else:
        code_parts.extend(field_definitions)

    code_parts.append("\n    model_config = {'extra': 'forbid'}")

    return "\n".join(code_parts) + "\n"

In [None]:
# Parially working - DO NOT EXECUTE
if __name__ == "__main__":
    logger.info("Starting Schema.org to Pydantic Conversion Process...")

    # Step 1: Parse Schema
    schema_graph = parse_schema_to_graph(SCHEMA_FILE, SCHEMA_FORMAT)
    if not schema_graph:
        logger.critical("Failed to parse schema graph. Exiting.")
        exit(1)

    # Step 2: Analyze Schema
    # Make sure analyze_schema_graph is defined as in previous steps
    analyzed_schema = analyze_schema_graph(schema_graph)
    if not analyzed_schema or not analyzed_schema.get("classes"):
        logger.critical("Schema analysis failed or yielded no classes. Exiting.")
        exit(1)

    # Extract info needed by generator functions
    classes_info: Dict[rdflib.URIRef, ClassInfo] = analyzed_schema.get("classes", {})
    properties_info: Dict[rdflib.URIRef, PropertyInfo] = analyzed_schema.get("properties", {}) # Make sure properties_info is accessible globally or passed correctly
    all_class_names = {map_uri_to_classname(uri) for uri in classes_info.keys()}
    all_class_uris = set(classes_info.keys())
    # Define base type URIs and names (ensure consistency with BASE_TYPES_CODE)
    base_type_uris = {
        SCHEMA.Quantity, SCHEMA.Distance, SCHEMA.Duration, SCHEMA.DefinedTerm, SCHEMA.Money
    }
    base_type_names = {map_uri_to_classname(uri) for uri in base_type_uris if uri in classes_info}
    all_class_names.update(base_type_names) # Ensure base types are in the registry

    # Step 3 & 4: Generate Models (including base_types.py and __init__.py)
    # Ensure generate_ontology_models definition is available
    # It internally calls generate_pydantic_model_code which now takes extra args
    # We need to adjust generate_ontology_models to pass them

    # --- Adjusted generate_ontology_models definition ---
    # (Include the full function from the previous correct response, ensuring it
    # passes 'all_class_uris' and 'schema_graph' to generate_pydantic_model_code
    # and uses the correct 'module_to_class_map' for __init__.py generation)

    # Example call - assuming generate_ontology_models is defined correctly above
    try:
         generate_ontology_models(
             analyzed_schema=analyzed_schema,
             output_base_dir=OUTPUT_DIR,
             models_subdir=MODELS_SUBDIR
             # Pass schema_graph and all_class_uris if needed by the internal call
             # This depends on how you structure the final script
             # Option 1: Make properties_info, all_class_names, all_class_uris, schema_graph global
             # Option 2: Pass them through generate_ontology_models down to generate_pydantic_model_code
         )
         logger.info("Ontology generation process finished.")
         logger.info(f"Generated models located in: {pathlib.Path(OUTPUT_DIR) / MODELS_SUBDIR}")
         logger.info("Proceed with Step 5: Post-Processing & Verification (running formatters, linters, mypy).")
    except Exception as e:
         logger.critical(f"Ontology generation failed: {e}", exc_info=True)
         exit(1)

    # Step 5 would be run externally using the script/commands from previous steps

In [224]:
def generate_pydantic_model_code(
    class_info: ClassInfo,
    properties_info: Dict[rdflib.URIRef, PropertyInfo],
    all_class_names: Set[str] # All class names being generated
) -> str:
    """Generates the Python code string for a single Pydantic model, including robust imports."""

    class_name = map_uri_to_classname(class_info.uri)
    
    base_uris = class_info.superclasses
    potential_base_names = {map_uri_to_classname(uri) for uri in base_uris}
    valid_base_names = sorted([name for name in potential_base_names if name in all_class_names and name != 'Thing'])

    # --- START: Revised Import Handling ---
    # Use sets to automatically handle duplicates
    core_imports = set()
    typing_imports_specific = set() # Specific items needed from typing
    pydantic_imports_specific = set() # Always need these
    datetime_imports_specific = set()
    other_imports = set() # For things like decimal, isodate
    cross_module_imports = set() # For other generated models (used in TYPE_CHECKING)
    rich_type_imports = set()
    runtime_base_class_imports = set() 

    # Determine base class string and imports needed for bases
    if not valid_base_names:
        base_class_str = "BaseModel"
        pydantic_imports_specific.add("BaseModel")
    else:
        base_class_str = ", ".join(valid_base_names)
        for base_name in valid_base_names:
             # Assume base classes are in other modules within the same package
            cross_module_imports.add(base_name)
            # runtime_base_class_imports.add(f"from .{get_module_path_for_class(base_name)} import {base_name}")

    # Analyze fields to determine necessary imports
    field_definitions = []
    sorted_property_uris = sorted(list(class_info.properties))
    field_added = False

    for prop_uri in sorted_property_uris:
        if prop_uri not in properties_info: continue # Skip if property info missing

        prop_info = properties_info[prop_uri]
        field_name = map_uri_to_fieldname(prop_info.uri)
        type_hint_str = map_range_to_typehint(prop_info.ranges, all_class_names)

        # --- Refined Import Tracking based on Type Hint String ---
        if "Optional" in type_hint_str: typing_imports_specific.add("Optional")
        if "List" in type_hint_str: typing_imports_specific.add("List")
        if "Union" in type_hint_str: typing_imports_specific.add("Union")
        if "Any" in type_hint_str: typing_imports_specific.add("Any")
        # Add VC-Zero/rich types if they come from specific modules
        if "datetime.date" in type_hint_str: datetime_imports_specific.add("date")
        if "datetime.datetime" in type_hint_str: datetime_imports_specific.add("datetime")
        if "datetime.time" in type_hint_str: datetime_imports_specific.add("time")
        if "timedelta" in type_hint_str: datetime_imports_specific.add("timedelta")
        if "decimal.Decimal" in type_hint_str: other_imports.add("import decimal")
        # Add imports for Pydantic types used
        if "pydantic.AnyUrl" in type_hint_str: pydantic_imports_specific.add("AnyUrl")
        if "pydantic.EmailStr" in type_hint_str: pydantic_imports_specific.add("EmailStr")
        # Add other specific pydantic types as needed based on TYPE_MAP
        if "Quantity" in type_hint_str: rich_type_imports.add("Quantity")
        if "Distance" in type_hint_str: rich_type_imports.add("Distance")
        if "Duration" in type_hint_str: rich_type_imports.add("Duration")
        if "DefinedTerm" in type_hint_str: rich_type_imports.add("DefinedTerm")

        # Track cross-module imports needed for type hints (used within TYPE_CHECKING)
        potential_classes_in_hint = set(re.findall(r"'(\w+)'", type_hint_str))
        for potential_class in potential_classes_in_hint:
             if potential_class in all_class_names and potential_class != class_name:
                 cross_module_imports.add(potential_class)
        # --- End Refined Import Tracking ---

        field_args_dict = get_field_metadata(prop_info)
        # Generate Field(...) call (same as before)
        field_args_parts = []
        default_val_repr = repr(field_args_dict.pop('default', None))
        field_args_parts.append(default_val_repr)
        field_args_parts.extend(f"{k}={repr(v)}" for k, v in field_args_dict.items())
        field_call = f"Field({', '.join(field_args_parts)})"

        # *** CRITICAL FIX: Ensure type hints use the *imported names*, not module prefixes ***
        # Basic replacement - more robust parsing might be needed for complex nested hints
        final_type_hint = type_hint_str.replace("pydantic.", "") # Remove prefix if imported directly
        final_type_hint = final_type_hint.replace("datetime.", "") # Remove prefix if imported directly
        # Replace typing prefixes only if specific types are imported
        if typing_imports_specific:
            final_type_hint = final_type_hint.replace("typing.", "")

        field_definitions.append(f"    {field_name}: {final_type_hint} = {field_call}")
        field_added = True

    # --- DEBUG LOGGING START ---
    # print(f"--- Debugging Imports for Class: {class_name} ---")
    # print(f"Field added flag: {field_added}")
    # print(f"Cross-module imports needed (before filtering bases): {cross_module_imports}")

    # Filter out base classes already imported at runtime from type-hint-only imports
    typehint_only_imports = cross_module_imports - set(valid_base_names)
    # print(f"Type-hint-only forward refs: {typehint_only_imports}")

    will_generate_type_checking_block = bool(typehint_only_imports)

    if will_generate_type_checking_block:
        typing_imports_specific.add("TYPE_CHECKING") # Only add it if the block will exist
    # print(f"Will generate TYPE_CHECKING block? {will_generate_type_checking_block}")




    # --- Assemble the full class code with Corrected Imports ---
    code_parts = []
    code_parts.append("from __future__ import annotations") # Keep this first

    # Add standard library imports
    if datetime_imports_specific:
        code_parts.append(f"from datetime import {', '.join(sorted(list(datetime_imports_specific)))}")
    code_parts.extend(sorted(list(other_imports))) # Like 'import decimal'

    

    # Generate the main typing import line (if needed)
    if typing_imports_specific:
        code_parts.append(f"from typing import {', '.join(sorted(list(typing_imports_specific)))}")
    

    # Add typing imports - always import TYPE_CHECKING
    # typing_imports_specific.add("TYPE_CHECKING")
    # code_parts.append(f"from typing import {', '.join(sorted(list(typing_imports_specific)))}")

    # Add Pydantic imports
    if field_added:
        pydantic_imports_specific.add("Field")
    if pydantic_imports_specific:
        code_parts.append(f"from pydantic import {', '.join(sorted(list(pydantic_imports_specific)))}")
    logging.debug(f"Final pydantic imports needed: {pydantic_imports_specific}")

    code_parts.extend(sorted(list(runtime_base_class_imports)))

    if rich_type_imports:
         code_parts.append(f"from .base_types import {', '.join(sorted(list(rich_type_imports)))}")


    # Add forward reference imports within TYPE_CHECKING block
    # Only include classes that are *not* base classes (already imported above if needed)
    typehint_only_imports = cross_module_imports - set(valid_base_names)
    if typehint_only_imports:
        # code_parts.append("\nif TYPE_CHECKING:")
        for class_to_import in sorted(list(typehint_only_imports)):
            code_parts.append(f"from {get_module_path_for_class(class_to_import)} import {class_to_import}")

    # **** MODIFICATION HERE ****
    # Conditionally add 'Field' to pydantic imports only if used
    
    # Only add the import line if the set is not empty
    
    # **** END MODIFICATION ****

    # Add imports for base classes (needed at runtime for class definition)
    # Place these *after* TYPE_CHECKING block if they are only needed for inheritance
    # If base classes are needed for type hints *within this file*, they need careful handling
    # For now, let's assume they are correctly imported for the class definition line.
    # This might still need adjustment based on specific inheritance patterns.
    if valid_base_names:
        # Ensure base classes are imported for the class definition line
        # This might duplicate imports already added above but ensures availability
        for base_name in valid_base_names:
            runtime_base_class_imports.add(f"from {get_module_path_for_class(base_name)} import {base_name}")
    
    code_parts.extend(sorted(list(runtime_base_class_imports)))
    code_parts.append("\n") # Separator

    # Add class definition
    class_docstring = f'"""\n    {class_name}: {textwrap.shorten(class_info.comment or "No description provided.", width=70)}\n\n    Generated from: {class_info.uri}\n    """'
    code_parts.append(f"class {class_name}({base_class_str}):")
    code_parts.append(f"    {class_docstring}")

    if not field_definitions:
        code_parts.append("    pass")
    else:
        code_parts.extend(field_definitions)

    code_parts.append("\n    model_config = {'extra': 'forbid'}")

    return "\n".join(code_parts) + "\n"


In [225]:
BASE_TYPES_CODE = """
from __future__ import annotations # Keep first
from pydantic import (
    BaseModel, Field, AnyUrl, field_validator,
    model_validator, condecimal, constr, EmailStr # Added EmailStr just in case, adjust as needed
)
from typing import Optional, List, Union, Any
from datetime import date, datetime, time, timedelta
import decimal
import isodate # Requires: pip install isodate
import logging

# Configure basic logging if needed within this module too
# logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) # Use logger for warnings

class Quantity(BaseModel):
    \"\"\"
    Base model for quantitative values based on schema.org/Quantity.
    Actual value and unit are often in subclasses or specific properties.
    This primarily serves as a conceptual base.
    \"\"\"
    model_config = {'extra': 'allow'} # Allow extra fields as Quantity is generic

class Distance(Quantity):
    \"\"\"
    Represents a distance based on schema.org/Distance.
    Uses value and unit representation common in QuantitativeValue.
    \"\"\"
    # Based on properties commonly used with QuantitativeValue for distance
    value: Optional[float] = Field(None, description="The numerical value of the distance.")
    unitCode: Optional[str] = Field(None, description="UN/CEFACT Common Code (3 characters) or URL for the unit of measurement. E.g., 'MTR' for meter, 'KM' for kilometer, 'FT' for foot, 'INH' for inch.")
    unitText: Optional[str] = Field(None, description="A string indicating the unit of measurement. Useful if unitCode is not applicable or needs clarification. E.g., 'meters', 'miles'.")

    model_config = {'extra': 'forbid'}

    # Add validation if needed, e.g., check unitCode format

class Duration(Quantity):
    \"\"\"
    Represents a duration based on schema.org/Duration.
    Stores duration as datetime.timedelta, parsed from ISO 8601 duration format.
    \"\"\"
    # Pydantic doesn't have native ISO 8601 duration parsing, use validator
    # Use alias to allow input using schema.org's likely property name if it differs
    value_iso8601: Optional[str] = Field(None, validation_alias='iso8601Duration', serialization_alias='iso8601Duration', description="Duration in ISO 8601 format (e.g., P1Y2M3DT4H5M6S).")
    value_timedelta: Optional[timedelta] = Field(None, exclude=True, description="Parsed timedelta value (internal).") # Exclude from standard model dump

    model_config = {'extra': 'forbid', 'populate_by_name': True} # Allow using alias on input

    @model_validator(mode='before')
    @classmethod
    def parse_duration(cls, data: Any) -> Any:
        if isinstance(data, dict):
            iso_duration_str = data.get("value_iso8601") or data.get("iso8601Duration")
            # Parse only if timedelta isn't already provided and string exists
            if iso_duration_str and isinstance(iso_duration_str, str) and 'value_timedelta' not in data:
                try:
                    td = isodate.parse_duration(iso_duration_str)
                    data['value_timedelta'] = td
                    data['value_iso8601'] = iso_duration_str # Ensure original is stored
                except (isodate.ISO8601Error, ValueError) as e:
                    logger.warning(f"Could not parse ISO 8601 duration '{iso_duration_str}': {e}")
                    data['value_timedelta'] = None
                    data['value_iso8601'] = iso_duration_str # Keep original invalid string
            # If timedelta provided directly, ensure value_timedelta field is populated
            elif data.get('value_timedelta') and isinstance(data.get('value_timedelta'), timedelta):
                 pass # Already populated
        elif isinstance(data, str):
             # Allow direct initialization from ISO string
             try:
                 td = isodate.parse_duration(data)
                 return {'value_iso8601': data, 'value_timedelta': td}
             except (isodate.ISO8601Error, ValueError) as e:
                 logger.warning(f"Could not parse ISO 8601 duration string '{data}': {e}")
                 return {'value_iso8601': data, 'value_timedelta': None}

        return data # Return dict for Pydantic processing

    # Optional: Add property to access timedelta easily
    @property
    def timedelta(self) -> Optional[timedelta]:
        return self.value_timedelta

    def __str__(self) -> str:
        \"\"\"Return ISO 8601 string representation if available.\"\"\"
        # Prefer original string if available, otherwise format timedelta (basic)
        if self.value_iso8601:
            return self.value_iso8601
        elif self.value_timedelta is not None:
             try:
                 # Attempt basic formatting back (might lose fidelity vs isodate.duration_isoformat)
                 return str(self.value_timedelta)
             except Exception:
                 return "Invalid Duration Timedelta"
        return "Invalid/Missing Duration"


class DefinedTerm(BaseModel):
    \"\"\"
    Represents a term from a defined set based on schema.org/DefinedTerm.
    \"\"\"
    # Core properties often associated with DefinedTerm
    termCode: Optional[str] = Field(None, description="A code that identifies this DefinedTerm within a DefinedTermSet.")
    name: Optional[str] = Field(None, description="The name of the item.")
    description: Optional[str] = Field(None, description="A description of the item.")
    # Allow referencing the set it belongs to, if known (using AnyUrl for flexibility)
    inDefinedTermSet: Optional[AnyUrl] = Field(None, description="A DefinedTermSet Organization or DataCatalog that contains this term.")

    model_config = {'extra': 'allow'} # Allow potential other properties from schema.org or extensions

class Money(BaseModel):
     \"\"\"
     Represents an amount of money with a currency. Based on schema.org concepts
     often used with PriceSpecification or MonetaryAmount.
     \"\"\"
     # Using 'amount' and 'currency' inspired by common patterns, not a direct schema.org/Money type
     amount: Optional[decimal.Decimal] = Field(None, description="The amount of money.")
     currency: Optional[constr(pattern=r'^[A-Z]{3}$')] = Field(None, description="ISO 4217 Currency Code") # type: ignore

     @field_validator('amount', mode='before')
     @classmethod
     def clean_amount(cls, v: Any) -> Optional[decimal.Decimal]: # Added type hints
         # Indentation Level 2 (Inside Function)
         if isinstance(v, (int, float)):
             try: # Indentation Level 3
                 return decimal.Decimal(v) # Indentation Level 4
             except Exception as e: # Indentation Level 3
                  logger.error(f"Error converting {v} to Decimal: {e}")
                  raise ValueError(f"Cannot convert {v} to Decimal") # Indentation Level 4
         if isinstance(v, str): # Indentation Level 2
             try: # Indentation Level 3
                 return decimal.Decimal(v.strip()) # Indentation Level 4
             except decimal.InvalidOperation: # Indentation Level 3
                  raise ValueError(f"Invalid decimal format for amount: {v}") # Indentation Level 4
         # Allow existing Decimals or None to pass through
         if isinstance(v, decimal.Decimal) or v is None: # Indentation Level 2
              return v # Indentation Level 3
         # Raise error for other unexpected types
         raise ValueError(f"Unexpected type for amount: {type(v)}") # Indentation Level 2

     model_config = {'extra': 'forbid'}

# Add other base types/VC-Zeros below if needed

"""

In [226]:
import rdflib
from rdflib.namespace import RDF, RDFS, OWL, XSD
from typing import List, Set, Dict, Optional, NamedTuple, Union, Any, cast # Ensure necessary types for function signature
import logging
import os
import pathlib
import shlex # If used by helper functions, ensure imported
import textwrap # If used by helper functions

# --- Assume previous code exists: ---
# SCHEMA namespace
# ClassInfo, PropertyInfo NamedTuples
# analyzed_schema dict (passed as argument)
# properties_info dict (extracted from analyzed_schema)
# BASE_TYPES_CODE string (containing Quantity, Distance, Duration, DefinedTerm, Money)
# Mapping functions: map_uri_to_classname, map_uri_to_fieldname, get_module_path_for_class
# generate_pydantic_model_code function (from latest correct version)
# --------------------------------------

logger = logging.getLogger(__name__) # Define logger for use within function

def generate_ontology_models(
     analyzed_schema: Dict[str, Dict],
     output_base_dir: str,
     models_subdir: str = "models"
     ) -> None: # Added return type hint
    """
    Generates Pydantic model files from analyzed schema info, including base_types.py
    and a correctly structured __init__.py.
    """

    classes_info: Dict[rdflib.URIRef, ClassInfo] = analyzed_schema.get("classes", {})
    properties_info: Dict[rdflib.URIRef, PropertyInfo] = analyzed_schema.get("properties", {})

    if not classes_info: # Check classes_info primarily
        logging.error("No class information found in analyzed schema. Cannot generate models.")
        return
    if not properties_info:
         logging.warning("No property information found in analyzed schema. Models may lack fields.")
         # Proceed cautiously, or return depending on desired strictness

    output_path = pathlib.Path(output_base_dir) / models_subdir
    output_path.mkdir(parents=True, exist_ok=True)
    # Ensure main __init__.py exists before generating submodules
    (output_path / "__init__.py").touch()

    # Define URIs and names for types handled ONLY in base_types.py
    base_type_uris = {
        SCHEMA.Quantity, SCHEMA.Distance, SCHEMA.Duration, SCHEMA.DefinedTerm,
        # Add SCHEMA.Money if defined in BASE_TYPES_CODE
        SCHEMA.Money
    }
    base_type_names = {map_uri_to_classname(uri) for uri in base_type_uris if uri in classes_info} # Get names only for types actually present

    # --- Generate base_types.py ---
    base_types_path = output_path / "base_types.py"
    try:
        # Assuming BASE_TYPES_CODE string is defined globally or passed in
        global BASE_TYPES_CODE
        if 'BASE_TYPES_CODE' not in globals():
             raise NameError("BASE_TYPES_CODE string not found.")

        with open(base_types_path, "w", encoding="utf-8") as f:
            f.write(BASE_TYPES_CODE)
        logging.info(f"Generated base types file: {base_types_path}")
    except Exception as e:
        logging.error(f"Failed to write base_types.py: {e}", exc_info=True)
        return # Stop if base types cannot be written

    all_class_names = {map_uri_to_classname(uri) for uri in classes_info.keys()}
    # Add base type names to the known class registry for type hinting purposes
    all_class_names.update(base_type_names)

    # --- (Optional but recommended) Topological Sort for Generation Order ---
    # Using alphabetical as fallback, assuming TYPE_CHECKING handles most cycles
    generation_order_uris = sorted(list(classes_info.keys()))
    logging.info("Generating models in alphabetical order by URI.")
    # Add more robust sorting here if needed based on hierarchy


    # --- Generate individual model files, SKIPPING base types ---
    generated_files = 0
    # **** Store mapping from module name part to main class name ****
    module_to_class_map: Dict[str, str] = {}

    for class_uri in generation_order_uris:
        if class_uri not in classes_info: continue

        # **** CORRECTED SKIP LOGIC ****
        if class_uri in base_type_uris:
            logging.debug(f"Skipping individual file generation for {class_uri} (defined in base_types.py)")
            continue
        # **** END CORRECTION ****

        class_info = classes_info[class_uri]
        class_name = map_uri_to_classname(class_uri)
        module_name_part = map_uri_to_fieldname(class_uri) # snake_case name for file/module

        try:
            model_code = generate_pydantic_model_code(class_info, properties_info, all_class_names)
            file_path = output_path / f"{module_name_part}.py"
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(model_code)

            # **** Store mapping IF generation succeeded ****
            module_to_class_map[module_name_part] = class_name
            generated_files += 1

        except Exception as e:
            logging.error(f"Failed to generate code for class {class_name} ({class_uri}): {e}", exc_info=True) # Add traceback

    logging.info(f"Generated {generated_files} specific Pydantic model files in {output_path}")


    # --- *** CORRECTED __init__.py Generation *** ---
    init_py_path = output_path / "__init__.py"
    try:
        with open(init_py_path, "w", encoding="utf-8") as f:
            f.write("# flake8: noqa\n")
            f.write("# Auto-generated __init__.py for ontology models\n\n")
            f.write("import logging\n")
            f.write("import importlib\n")
            f.write("import pkgutil\n")
            f.write("from typing import TYPE_CHECKING # TYPE_CHECKING might be needed by rebuild_all\n")
            f.write("from pydantic import BaseModel # Needed for rebuild_all check\n\n")

            f.write("logger: logging.Logger = logging.getLogger(__name__)\n\n") # Define early

            # 1. Import explicitly from base_types
            if base_type_names:
                f.write("# --- Import Base Types ---\n")
                f.write("try:\n")
                f.write(f"    from .base_types import ({', '.join(sorted(list(base_type_names)))})\n") # Explicit names
                f.write("except ImportError:\n")
                f.write("    logger.warning('Could not import base_types')\n\n")

            # 2. Import explicitly from all OTHER generated modules
            if module_to_class_map:
                 f.write("# --- Import Generated Models ---\n")
            generated_class_names = set()
            for module_name_part in sorted(module_to_class_map.keys()):
                class_name = module_to_class_map[module_name_part]
                # Ensure we don't try to re-import base types if map is somehow wrong
                if class_name not in base_type_names:
                    f.write(f"try:\n")
                    f.write(f"    from .{module_name_part} import {class_name}\n")
                    generated_class_names.add(class_name)
                    f.write(f"except ImportError:\n")
                    f.write(f"    logger.warning(f'Could not import {class_name} from .{module_name_part}')\n")

            # 3. Define __all__ for cleaner namespace
            all_names = sorted(list(base_type_names | generated_class_names))
            f.write("\n__all__ = [\n")
            for name in all_names:
                f.write(f'    "{name}",\n') # Use double quotes for names in list
            f.write("]\n\n")


            # 4. Include rebuild_all function and call (with fixed annotation)
            f.write("# --- Rebuild models to resolve forward references ---\n")
            f.write("def rebuild_all() -> None:\n") # Ensure annotation is present
            f.write("    package_name = __name__\n")
            f.write("    package = importlib.import_module(package_name)\n")
            f.write("    rebuilt_models = set()\n")
            f.write('    logger.debug(f"Attempting rebuild in {package_name}")\n\n')
            f.write("    for _, module_name_part, _ in pkgutil.iter_modules(package.__path__, package_name + '.') :\n")
            f.write("        # Skip rebuild attempt on __init__ itself or base_types if desired\n")
            f.write("        if module_name_part.endswith('.__init__') or module_name_part.endswith('.base_types'):\n")
            f.write("            continue\n")
            f.write("        try:\n")
            f.write("            module = importlib.import_module(module_name_part)\n")
            f.write("            for attribute_name in dir(module):\n")
            f.write("                attribute = getattr(module, attribute_name)\n")
            f.write("                if (isinstance(attribute, type) and\n")
            f.write("                        issubclass(attribute, BaseModel) and\n") # Check issubclass safely
            f.write("                        attribute is not BaseModel and\n")
            f.write("                        hasattr(attribute, 'model_rebuild') and\n") # Check if it has the method
            f.write("                        attribute not in rebuilt_models):\n")
            f.write("                    try:\n")
            f.write('                        logger.debug(f"Rebuilding: {attribute.__name__}")\n')
            f.write("                        attribute.model_rebuild(force=True)\n")
            f.write("                        rebuilt_models.add(attribute)\n")
            f.write("                    except Exception as e_rebuild:\n")
            f.write("                        logger.error(f'Error rebuilding model {attribute.__name__} in {module_name_part}: {e_rebuild}', exc_info=False)\n")
            f.write("        except ModuleNotFoundError:\n")
            f.write("            logger.warning(f\"Module not found during rebuild: {module_name_part}\")\n")
            f.write("        except Exception as e_import:\n")
            f.write("             logger.error(f'Error importing module {module_name_part} during rebuild: {e_import}', exc_info=False)\n\n")

            f.write("# Run rebuild automatically on import\n")
            f.write("try:\n")
            f.write("    rebuild_all()\n")
            f.write("    logger.info(f'Pydantic models in {__name__} package rebuilt.')\n")
            f.write("except Exception as e_global:\n")
            f.write("    logger.error(f'Global error during model rebuild: {e_global}', exc_info=True)\n")

        logging.info(f"Successfully generated __init__.py at {init_py_path}")

    except Exception as e:
        logging.error(f"Failed to write __init__.py: {e}", exc_info=True)


# --- Main execution block (Example) ---
# if __name__ == "__main__":
#     # Assuming schema_graph and analyzed_schema are loaded/created
#     # schema_graph = parse_schema_to_graph(...)
#     # analyzed_schema = analyze_schema_graph(schema_graph)
#     if analyzed_schema and analyzed_schema.get("classes"):
#         generate_ontology_models(analyzed_schema, "output_ontology", "models")
#         logging.info("Ontology generation process finished.")
#     else:
#          logging.error("Analysis data missing, cannot generate models.")

In [187]:
# # --- generate_ontology_models function (Mostly Unchanged, calls revised generator) ---

# def generate_ontology_models(
#      analyzed_schema: Dict[str, Dict],
#      output_base_dir: str,
#      models_subdir: str = "models"
#      ):
#     """Generates Pydantic model files from analyzed schema info."""

#     classes_info: Dict[rdflib.URIRef, ClassInfo] = analyzed_schema.get("classes", {})
#     properties_info: Dict[rdflib.URIRef, PropertyInfo] = analyzed_schema.get("properties", {})

#     if not classes_info or not properties_info:
#         logging.error("No class or property information found in analyzed schema.")
#         return

#     # **** NEW STEP: Generate base_types.py ****
#     output_path = pathlib.Path(output_base_dir) / models_subdir
#     output_path.mkdir(parents=True, exist_ok=True)
#     (output_path / "__init__.py").touch() # Ensure __init__ exists

#     base_types_path = output_path / "base_types.py"
#     base_type_uris = {
#         SCHEMA.Quantity, SCHEMA.Distance, SCHEMA.Duration, SCHEMA.DefinedTerm,
#         # Add SCHEMA.Money if defined there
#     }
#     try:
#         with open(base_types_path, "w", encoding="utf-8") as f:
#             f.write(BASE_TYPES_CODE)
#         logging.info(f"Generated base types file: {base_types_path}")
#     except Exception as e:
#         logging.error(f"Failed to write base_types.py: {e}")
#         return # Stop if base types cannot be written
#     # **** END NEW STEP ****

#     # Map URIs to their class names for import generation
#     base_type_names = {map_uri_to_classname(uri) for uri in base_type_uris}

#     # --- Generate individual model files ---
#     generated_files = 0


#     all_class_names = {map_uri_to_classname(uri) for uri in classes_info.keys()}

#     # Basic topological sort preparation (dependencies based on superclasses)
#     dependencies = {
#         map_uri_to_classname(cls_uri): {map_uri_to_classname(sup_uri)
#                                         for sup_uri in info.superclasses
#                                         if sup_uri in classes_info} # Only consider superclasses we generate
#         for cls_uri, info in classes_info.items()
#     }

#     generation_order = []
#     visited = set()
#     visiting = set()

#     def visit(class_name):
#         """Helper for topological sort."""
#         if class_name not in dependencies: return # Might be external like BaseModel
#         visited.add(class_name)
#         visiting.add(class_name)
#         for dep in dependencies.get(class_name, set()):
#             if dep in visiting:
#                 # Cycle detected - handle appropriately (e.g., log, break, complex resolution)
#                 logging.warning(f"Potential circular dependency involving {class_name} and {dep}. Relying on forward refs.")
#             if dep not in visited:
#                 visit(dep)
#         visiting.remove(class_name)
#         generation_order.append(map_uri_to_classname(rdflib.URIRef(f"{SCHEMA}{class_name}"))) # Reconstruct URI crudely - needs improvement


#     # Determine generation order (simple approach, might need refinement for complex cases)
#     all_class_names_sorted_list = sorted(list(all_class_names)) # Fallback if sort fails
#     try:
#          # Attempt topological sort (may fail on cycles)
#          for class_name in all_class_names_sorted_list:
#              if class_name not in visited:
#                  visit(class_name)
#          generation_order_uris = [rdflib.URIRef(f"{SCHEMA}{name}") for name in generation_order]
#          logging.info("Generated class order based on inheritance.")
#     except Exception as e:
#          logging.warning(f"Topological sort for generation order failed ({e}). Using alphabetical.")
#          generation_order_uris = sorted(list(classes_info.keys()))


#     # Generate files
#     generated_files = 0
#     all_generated_modules = set()
#     for class_uri in generation_order_uris:
#          if class_uri not in classes_info: continue # Skip if URI wasn't mapped back correctly or is external

#          class_info = classes_info[class_uri]
#          class_name = map_uri_to_classname(class_uri)
#          module_name_part = map_uri_to_fieldname(class_uri)
#          all_generated_modules.add(module_name_part) # Track generated module names

#          try:
#              model_code = generate_pydantic_model_code(class_info, properties_info, all_class_names)
#              file_path = output_path / f"{module_name_part}.py"
#              with open(file_path, "w", encoding="utf-8") as f:
#                  f.write(model_code)
#              generated_files += 1
#          except Exception as e:
#              logging.error(f"Failed to generate code for class {class_name} ({class_uri}): {e}")

#     logging.info(f"Generated {generated_files} Pydantic model files in {output_path}")

#     # --- Update __init__.py to import all generated models and call model_rebuild ---
#     init_py_path = output_path / "__init__.py"
#     with open(init_py_path, "w", encoding="utf-8") as f: # Overwrite __init__.py
#          f.write("# flake8: noqa\n") # Suppress linting errors for unused imports potentially
#          f.write("# Auto-generated __init__.py for ontology models\n\n")
#          f.write("import logging\n")
#          f.write("from pydantic import BaseModel\n")
#          f.write("import importlib\n")
#          f.write("import pkgutil\n")
#          f.write("from typing import TYPE_CHECKING\n\n") # Needed for the rebuild function
#          # Import base_types explicitly
         
#          f.write("logger = logging.getLogger(__name__)\n\n")
#          f.write("try:\n")
#          f.write("    from .base_types import Quantity, Distance, Duration, DefinedTerm\n")
#          f.write("except ImportError:\n")
#          f.write("    logger.warning('Could not import base_types')\n\n")

#          # Import all models explicitly for easier use and rebuild
#          for module_name_part in sorted(list(all_generated_modules)):
#              try:
#                  # Attempt to guess class name from module name (needs consistent mapping)
#                  # This is fragile - better to pass class_name list explicitly
#                  class_name_guess = map_uri_to_classname(rdflib.URIRef(f"{SCHEMA}{module_name_part}")) # Reverse mapping attempt
#                  if class_name_guess in all_class_names:
#                      f.write(f"from .{module_name_part} import {class_name_guess}\n")
#                  else: # Fallback if reverse mapping failed
#                      f.write(f"try:\n")
#                      f.write(f"    from .{module_name_part} import *\n") # Less ideal, try to import *
#                      f.write(f"except ImportError:\n")
#                      f.write(f"    logger.warning(f'Could not import from .{module_name_part}')\n")

#              except Exception:
#                  logger.warning(f"Could not generate import for module {module_name_part}")


#          # The rebuild function (unchanged from previous version)
#          f.write("""
# # --- Rebuild models to resolve forward references ---
# def rebuild_all() -> None:
#     package_name = __name__
#     package = importlib.import_module(package_name)
#     rebuilt_models = set()
#     logger.debug(f"Attempting rebuild in {package_name}")

#     for _, module_name_part, _ in pkgutil.iter_modules(package.__path__, package_name + '.'):
#         try:
#             module = importlib.import_module(module_name_part)
#             for attribute_name in dir(module):
#                 attribute = getattr(module, attribute_name)
#                 # Check if it's a class, subclass of BaseModel, not BaseModel itself, and not rebuilt yet
#                 if (isinstance(attribute, type) and
#                         issubclass(attribute, BaseModel) and
#                         attribute is not BaseModel and
#                         attribute not in rebuilt_models):
#                     try:
#                         logger.debug(f"Rebuilding: {attribute.__name__}")
#                         attribute.model_rebuild(force=True)
#                         rebuilt_models.add(attribute)
#                     except Exception as e_rebuild:
#                         logger.error(f'Error rebuilding model {attribute.__name__} in {module_name_part}: {e_rebuild}', exc_info=False) # Reduce noise
#         except ModuleNotFoundError:
#             logger.warning(f"Module not found during rebuild: {module_name_part}")
#         except Exception as e_import:
#              logger.error(f'Error importing module {module_name_part} during rebuild: {e_import}', exc_info=False) # Reduce noise

# # Run rebuild automatically on import
# try:
#     rebuild_all()
#     logger.info(f'Pydantic models in {__name__} rebuilt.')
# except Exception as e_global:
#     logger.error(f'Global error during model rebuild: {e_global}', exc_info=True)
# """)


In [188]:
# Step 4: --- Main execution block demonstrating generation ---
if __name__ == "__main__":
    # Assume schema_graph is loaded from Step 1
    # Assume analyzed_schema is created from Step 2
    schema_graph = parse_schema_to_graph(SCHEMA_FILE, SCHEMA_FORMAT)
    if schema_graph:
        analyzed_schema = analyze_schema_graph(schema_graph)
        if analyzed_schema and analyzed_schema.get("classes"):
            # Step 4: Generate the Pydantic models
            generate_ontology_models(analyzed_schema, OUTPUT_DIR, MODELS_SUBDIR)
            logging.info(f"Pydantic model code generation complete. Check the '{OUTPUT_DIR}/{MODELS_SUBDIR}' directory.")
            logging.info("Ready for Step 5: Post-Processing & Verification.")
        else:
             logging.error("Schema analysis did not produce class/property data.")
    else:
        logging.error("Failed to parse schema graph for generation.")

INFO:root:Attempting to parse schema file: schema.txt (format: turtle)
INFO:root:Successfully parsed 8904 triples.
INFO:root:Found 628 potential schema.org classes.
INFO:root:Found 921 potential schema.org properties.
INFO:root:Analyzed 628 classes and 921 properties.
INFO:root:Generated base types file: output_ontology/models/base_types.py
INFO:root:Generating models in alphabetical order by URI.
INFO:root:Generated 625 specific Pydantic model files in output_ontology/models
INFO:root:Successfully generated __init__.py at output_ontology/models/__init__.py
INFO:root:Pydantic model code generation complete. Check the 'output_ontology/models' directory.
INFO:root:Ready for Step 5: Post-Processing & Verification.


In [181]:
if __name__ == "__main__":
    # Placeholder registry
    known_generated_classes = {"Person", "DefinedTerm", "Distance", "Duration", "Quantity"}

    # Example: Map range for schema:educationalUse (includes DefinedTerm)
    edu_use_ranges = {SCHEMA.DefinedTerm, SCHEMA.Text}
    edu_use_type_hint = map_range_to_typehint(edu_use_ranges, known_generated_classes)
    # print(f"Mapping for schema:educationalUse ranges ({edu_use_ranges}):")
    # Expect Optional[Union['DefinedTerm', str]]
    # print(f" -> Type Hint: {edu_use_type_hint}")

    # Example: Map range for schema:distance (can be Distance)
    distance_ranges = {SCHEMA.Distance}
    distance_type_hint = map_range_to_typehint(distance_ranges, known_generated_classes)
    # print(f"\nMapping for schema:distance ranges ({distance_ranges}):")
    # Expect Optional['Distance']
    # print(f" -> Type Hint: {distance_type_hint}")

    # Example: Map range for schema:duration (can be Duration)
    duration_ranges = {SCHEMA.Duration}
    duration_type_hint = map_range_to_typehint(duration_ranges, known_generated_classes)
    # print(f"\nMapping for schema:duration ranges ({duration_ranges}):")
    # Expect Optional['Duration']
    # print(f" -> Type Hint: {duration_type_hint}")

    # Ensure necessary imports would be generated by Step 4
    print("\nRequired imports for these types include:")
    print("from pydantic import BaseModel, Field, AnyUrl")
    print("from typing import Optional, List, Union")
    print("from datetime import date, datetime, time, timedelta")
    print("import isodate")
    print("import decimal")
    print("# ... potentially other generated models")

    print("\nRicher mapping logic defined. Re-running Step 4 will use these structured types.")
    print("Ready for Step 5: Post-Processing & Verification.")


Required imports for these types include:
from pydantic import BaseModel, Field, AnyUrl
from typing import Optional, List, Union
from datetime import date, datetime, time, timedelta
import isodate
import decimal
# ... potentially other generated models

Richer mapping logic defined. Re-running Step 4 will use these structured types.
Ready for Step 5: Post-Processing & Verification.


Step 5: Post processing

In [None]:
# Example: postprocess.py

import subprocess
import sys
import logging
import pathlib
import shlex # Use shlex for safer command construction

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Define target directories (adjust paths as needed)
TARGET_DIR = pathlib.Path("output_ontology") # Main output directory
MODELS_DIR = TARGET_DIR / "models" # Specific directory with models
TESTS_DIR = TARGET_DIR / "tests"  # Directory for tests

# Tool Configurations (Ideally read from pyproject.toml or similar)
# Example: Fail bandit on medium+ severity, medium+ confidence
BANDIT_ARGS = ["-r", str(MODELS_DIR), "-ll", "--exit-zero"] # Exit-zero initially to report, fail later based on output if needed
# Example: MyPy needs strict checks
MYPY_ARGS = ["--strict", str(MODELS_DIR)]
# Example: Ruff - fix specific issues, check others
RUFF_FIX_ARGS = ["check", str(TARGET_DIR), "--fix", "--select", "F", "--select", "E", "--select", "W", "--select", "I"] # Fix F401, E, W, I (Imports etc)
RUFF_CHECK_ARGS = ["check", str(TARGET_DIR)] # Check everything else

def run_command(command: list[str], cwd: str = ".", check: bool = True, capture: bool = False) -> subprocess.CompletedProcess | None:
    """Runs a command, logs, checks for errors, optionally captures output."""
    command_str = shlex.join(command) # Safer than " ".join
    logging.info(f"Running: {command_str}")
    try:
        result = subprocess.run(
            command,
            capture_output=capture,
            text=True,
            check=check, # Raises CalledProcessError if return code is non-zero
            cwd=cwd,
            encoding='utf-8'
        )
        if capture:
            if result.stdout: logging.debug(f"Stdout:\n{result.stdout}")
            if result.stderr: logging.debug(f"Stderr:\n{result.stderr}") # Debug level for stderr unless error occurred
        logging.info(f"Command succeeded: {command_str}")
        return result
    except FileNotFoundError:
        logging.error(f"Error: Command not found: {command[0]}. Is the tool installed and in PATH?")
        raise # Re-raise to stop the pipeline
    except subprocess.CalledProcessError as e:
        logging.error(f"Command failed with exit code {e.returncode}: {command_str}")
        if capture: # Log captured output on error
             if e.stdout: logging.error(f"Failed command stdout:\n{e.stdout}")
             if e.stderr: logging.error(f"Failed command stderr:\n{e.stderr}")
        raise # Re-raise to stop the pipeline
    except Exception as e:
        logging.error(f"An unexpected error occurred running {command_str}: {e}")
        raise # Re-raise to stop the pipeline

def run_post_processing_pipeline():
    """
    Executes a robust post-processing and verification pipeline
    for the generated ontology models.
    Stops immediately if any critical step fails.
    """
    logging.info("--- Starting Robust Post-Processing & Verification Pipeline ---")
    try:
        # Preparation: Ensure directories and necessary files exist
        MODELS_DIR.mkdir(parents=True, exist_ok=True)
        (MODELS_DIR / "py.typed").touch()
        TESTS_DIR.mkdir(parents=True, exist_ok=True)
        (TESTS_DIR / "__init__.py").touch()
        logging.info("Directories and marker files ensured.")

        # Stage 1: Formatting (Fail on error)
        # Using Ruff format as an example, Black is also fine
        logging.info("Stage 1: Formatting...")
        run_command(["ruff", "format", str(TARGET_DIR)], check=True)

        # Stage 2: Linting & Auto-Fixing (Fail on error)
        # Run fix for specific safe-to-fix categories first
        logging.info("Stage 2a: Linting & Auto-Fixing (Imports, Syntax, Style)...")
        run_command(["ruff", "check", str(TARGET_DIR), "--fix", "--select", "F", "--select", "E", "--select", "W", "--select", "I", "--exit-zero-even-if-fixed"], check=True)
        # Run check for remaining issues (don't auto-fix things like complexity)
        logging.info("Stage 2b: Linting (Remaining Checks)...")
        run_command(["ruff", "check", str(TARGET_DIR)], check=True) # Fail if non-fixable errors remain

        # Stage 3: Static Type Checking (Fail on error)
        logging.info("Stage 3: Type Checking...")
        run_command(["mypy"] + MYPY_ARGS, check=True)

        # Stage 4: Security Scanning (Fail on error - check output if needed)
        logging.info("Stage 4: Security Scanning...")
        # Run bandit, capture output, decide later if specific findings should fail the build
        bandit_result = run_command(["bandit"] + BANDIT_ARGS, check=False, capture=True)
        # Example: Fail build only if Bandit found issues and exited non-zero (if not using --exit-zero)
        # Or parse bandit_result.stdout (JSON format possible) for high-severity issues
        # For now, we just log it based on its exit code (using --exit-zero means it won't fail the script here)
        if bandit_result and bandit_result.returncode != 0:
             logging.warning("Bandit found issues, but configured not to fail the build (--exit-zero). Review output.")
             # If not using --exit-zero, the run_command would have raised an error if issues found.

        # Stage 5: Testing (Fail on error)
        logging.info("Stage 5: Testing...")
        logging.warning("Ensure tests are present in " + str(TESTS_DIR))
        run_command(["pytest", str(TESTS_DIR)], check=True)

        logging.info("--- Pipeline Completed Successfully ---")

    except (subprocess.CalledProcessError, FileNotFoundError, Exception) as e:
        # Error already logged by run_command
        logging.critical("--- Pipeline Failed ---")
        sys.exit(1) # Exit with non-zero code to signal failure

if __name__ == "__main__":
    run_post_processing_pipeline()