# Modules


## Configuration

In [1]:
!mkdir configuration && touch configuration/__init__.py

In [None]:
%%writefile configuration/main.py

import json
import os
import time
import re
import glob
import yaml
from datetime import datetime

# Path of the YAML configuration *file* (despite the `_DIR` suffix); load_config()
# falls back to it when no path is given.
# Google Drive alternative: "/content/drive/MyDrive/cfr2sbvr/config.yaml"
DEFAULT_CONFIG_DIR: str = '../config.yaml'

def get_next_filename(file_dir: str, file_prefix: str, extension: str) -> str:
    """
    Generates the next filename in a sequence based on existing files in a directory,
    considering the file extension.

    The filename format is: `{file_prefix}-{YYYY-MM-DD}-{N}.{extension}`,
    where `N` is an incrementing integer for files with the same date.
    Numbering restarts at 1 every day: only files carrying *today's* date are
    taken into account, so files with any other date (past or future) never
    influence the result.

    The directory is created if it does not exist yet. The returned file itself
    is NOT created.

    Args:
        file_dir (str): The directory where the files are stored.
        file_prefix (str): The prefix used in the filenames.
        extension (str): The file extension without the dot (e.g., 'json', 'txt').

    Returns:
        str: The full path to the next filename in the sequence.

    Example:
        next_file = get_next_filename('../checkpoints', 'documents', 'json')
        print(next_file)
        # Output might be: ../checkpoints/documents-2024-10-19-5.json
    """
    today_str: str = datetime.today().strftime('%Y-%m-%d')

    # exist_ok=True avoids the check-then-create race of a separate exists() test
    os.makedirs(file_dir, exist_ok=True)

    # Match only today's files for this prefix/extension and capture the counter
    pattern = re.compile(
        r'^' + re.escape(file_prefix) + r'-' + re.escape(today_str)
        + r'-(\d+)\.' + re.escape(extension) + r'$'
    )

    todays_numbers = [
        int(match.group(1))
        for match in map(pattern.match, os.listdir(file_dir))
        if match
    ]

    # No file for today yet -> start at 1; otherwise continue after the highest one
    new_number: int = max(todays_numbers, default=0) + 1

    new_filename: str = f'{file_prefix}-{today_str}-{new_number}.{extension}'
    return os.path.join(file_dir, new_filename)


# Load the YAML config file
def load_config(config_file: str = None):
    """
    Loads the YAML configuration file and completes it with run-time values.

    On top of the values read from the file, the returned dict gets:
      - config["LLM"]["OPENAI_API_KEY"]: the `OPENAI_API_KEY` environment
        variable when it is set, otherwise the value from the file (or None).
      - config["DEFAULT_CHECKPOINT_FILE"]: next `documents-<date>-<N>.json`
        path inside config["DEFAULT_CHECKPOINT_DIR"].
      - config["DEFAULT_EXTRACTION_REPORT_FILE"]: next
        `extraction_report-<date>-<N>.html` path inside config["DEFAULT_OUTPUT_DIR"].

    Note that computing the two file names goes through get_next_filename,
    which creates the corresponding directories when they do not exist.

    Args:
        config_file (str, optional): Path to the YAML file. Defaults to
            DEFAULT_CONFIG_DIR when None.

    Returns:
        dict: The loaded and completed configuration.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        ValueError: If the file is not valid YAML, is not a mapping, or lacks
            a required key.
    """
    if config_file is None:
        config_file = DEFAULT_CONFIG_DIR
    try:
        with open(config_file, "r", encoding="utf-8") as file:
            config = yaml.safe_load(file)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Configuration file {config_file} not found.") from exc
    except yaml.YAMLError as exc:
        raise ValueError(f"Error parsing YAML file {config_file}: {exc}") from exc

    # safe_load returns None for an empty file and may return a scalar or a list;
    # only a mapping can hold the keys required below
    if not isinstance(config, dict):
        raise ValueError("Required configuration keys are missing in the config file.")

    # Ensure config structure is correct (every key read below must be present)
    required_keys = ("LLM", "DEFAULT_CHECKPOINT_DIR", "DEFAULT_OUTPUT_DIR")
    missing_keys = [key for key in required_keys if key not in config]
    if missing_keys:
        raise ValueError(
            "Required configuration keys are missing in the config file: "
            + ", ".join(missing_keys)
        )
    if not isinstance(config["LLM"], dict):
        raise ValueError("Configuration key 'LLM' must be a mapping.")

    # The environment variable takes precedence; the value from the config file
    # is only the fallback when OPENAI_API_KEY is not set in the environment
    config["LLM"]["OPENAI_API_KEY"] = os.getenv(
        "OPENAI_API_KEY", config["LLM"].get("OPENAI_API_KEY")
    )

    # Dynamically set checkpoint and report files using the get_next_filename function
    config["DEFAULT_CHECKPOINT_FILE"] = get_next_filename(
        config["DEFAULT_CHECKPOINT_DIR"], "documents", "json"
    )
    config["DEFAULT_EXTRACTION_REPORT_FILE"] = get_next_filename(
        config["DEFAULT_OUTPUT_DIR"], "extraction_report", "html"
    )

    return config

If running in Google Colab, mount Google Drive and back up the `configuration` package to it.

In [7]:
import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  # backup on Google Drive
  !cp -r configuration /content/drive/MyDrive/cfr2sbvr/modules

### Testing

In [None]:
import configuration.main as configuration

# Development mode
import importlib
importlib.reload(configuration)

Load configuration

In [15]:
config = configuration.load_config()

In [None]:
config

## Checkpoint

Checkpoints are stored in / retrieved from the file given by `DEFAULT_CHECKPOINT_FILE`, a path that `load_config` generates inside the directory set by `DEFAULT_CHECKPOINT_DIR` in the configuration file.

In [1]:
!mkdir checkpoint && touch checkpoint/__init__.py

In [20]:
%%writefile checkpoint/main.py

from typing import List, Dict, Optional, Any, Tuple, Set
from pydantic import BaseModel, Field
import logging
import json
from json import JSONDecodeError

# Set up basic logging configuration for the checkpoint module
# NOTE(review): this runs at import time and configures the *root* logger, not
# only this module's logger. logging.basicConfig() does nothing when the root
# logger already has handlers, so whichever basicConfig() call runs first in the
# process wins (unless a later call passes force=True).
logging.basicConfig(
    level=logging.INFO,  # Set to INFO or another level as needed
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log format
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by restore_checkpoint / save_checkpoint below
logger = logging.getLogger(__name__)

def convert_set_to_list(data: Any) -> Any:
    """
    Recursively converts sets to lists in the data structure.

    Sets and frozensets are not JSON serializable, so every one of them is
    replaced by a list, wherever it is nested: inside dict values, lists,
    tuples, or other sets. The element order of a converted set is the set's
    iteration order and is therefore not guaranteed.

    Args:
        data (Any): The data structure to process, which can be a dict, list,
            tuple, set, frozenset, or other types.

    Returns:
        Any: The data structure with all sets converted to lists. Dicts, lists
            and tuples are rebuilt (tuples come back as plain tuples); any
            other value is returned unchanged.
    """
    if isinstance(data, dict):
        return {key: convert_set_to_list(value) for key, value in data.items()}
    if isinstance(data, (list, set, frozenset)):
        # Recurse into the elements too: a set may hold frozensets or tuples
        return [convert_set_to_list(item) for item in data]
    if isinstance(data, tuple):
        # A tuple may carry sets; keep it a tuple but convert what it contains
        return tuple(convert_set_to_list(item) for item in data)
    return data


# Define a model for the Document
# One stored item of a checkpoint. DocumentManager keys its documents by the
# (id, type) pair, so two documents may share an id when their types differ.
class Document(BaseModel):
    id: str  # Identifier of the document; first half of the (id, type) key
    type: str  # The type of the document; second half of the (id, type) key
    content: Any  # Content can be any data type: list, dict, string, etc.

# Define the DocumentManager class
class DocumentManager(BaseModel):
    """
    In-memory collection of documents that can be saved to / loaded from JSON.

    Documents are keyed by the tuple (id, type). Because JSON object keys must
    be strings, the file format uses "<id>|<type>" as the key of each entry.
    """

    documents: Dict[Tuple[str, str], Document] = Field(default_factory=dict)  # Keys are tuples (id, type)

    def add_document(self, doc: Document) -> None:
        """
        Adds a document to the manager, replacing any document already stored
        under the same (id, type).

        Args:
            doc (Document): The document to add.
        """
        self.documents[(doc.id, doc.type)] = doc

    def retrieve_document(self, doc_id: str, doc_type: str) -> Optional[Document]:
        """
        Retrieves a document by its id and type.

        Args:
            doc_id (str): The ID of the document.
            doc_type (str): The type of the document.

        Returns:
            Optional[Document]: The retrieved document, or None if not found.
        """
        return self.documents.get((doc_id, doc_type))

    def list_document_ids(self, doc_type: Optional[str] = None) -> List[str]:
        """
        Lists all document ids, optionally filtered by type.

        Args:
            doc_type (Optional[str], optional): The type of documents to list.
                Defaults to None, which lists the ids of every type.

        Returns:
            List[str]: A list of document ids.
        """
        # Compare against None explicitly so an empty-string type still filters
        if doc_type is None:
            return [doc_id for (doc_id, _) in self.documents]
        return [doc_id for (doc_id, d_type) in self.documents if d_type == doc_type]

    def exclude_document(self, doc_id: str, doc_type: str) -> None:
        """
        Excludes a document by its id and type. Does nothing if it is not stored.

        Args:
            doc_id (str): The ID of the document to exclude.
            doc_type (str): The type of the document.
        """
        self.documents.pop((doc_id, doc_type), None)

    def persist_to_file(self, filename: str) -> None:
        """
        Persists the current state to a file, converting tuple keys to strings and sets to lists.

        Args:
            filename (str): The filename to save the documents.
        """
        serializable_documents = {
            f"{doc_id}|{doc_type}": convert_set_to_list(doc.dict())
            for (doc_id, doc_type), doc in self.documents.items()
        }
        with open(filename, 'w', encoding='utf-8') as file:
            json.dump(serializable_documents, file, indent=4)

    @classmethod
    def restore_from_file(cls, filename: str) -> 'DocumentManager':
        """
        Restores the state from a file written by persist_to_file.

        The (id, type) key of each document is rebuilt from the document's own
        `id` and `type` fields rather than by splitting the "<id>|<type>" JSON
        key, so ids or types that contain '|' are restored correctly.

        Args:
            filename (str): The filename to restore the documents from.

        Returns:
            DocumentManager: The restored DocumentManager instance.
        """
        with open(filename, 'r', encoding='utf-8') as file:
            data = json.load(file)

        documents = {}
        for doc_data in data.values():
            doc = Document(**doc_data)
            documents[(doc.id, doc.type)] = doc
        return cls(documents=documents)

def restore_checkpoint(filename: Optional[str]) -> DocumentManager:
    """
    Restores the document manager from a checkpoint file.

    This function never fails because of a missing checkpoint: when no filename
    is given, or the file does not exist, or it does not contain valid JSON
    (e.g. it is empty), an error is logged and an empty DocumentManager is
    returned instead.

    Args:
        filename (str, optional): The path to the checkpoint file. There is no
            default; callers typically pass the configured checkpoint file.

    Returns:
        DocumentManager: The restored DocumentManager instance, or a new empty
            one when the checkpoint could not be read.

    See Also:
        - Reset the values delete the documents.json file and run: manager = DocumentManager()
        - Restore the state from the documents.json file, run: DocumentManager.restore_from_file("documents.json")
        - Exclude a document: manager.exclude_document(doc_id="§ 275.0-2", doc_type="section")
        - List documents: manager.list_document_ids(doc_type="section")
        - Get a document: manager.retrieve_document(doc_id=doc, doc_type="section")
    """
    # open(None) would raise TypeError; treat "no file given" like "no checkpoint yet"
    if filename is None:
        logger.error("No checkpoint file given, initializing new checkpoint.")
        return DocumentManager()

    try:
        restored_docs = DocumentManager.restore_from_file(filename)
    except (FileNotFoundError, JSONDecodeError):
        restored_docs = DocumentManager()
        logger.error(f"Checkpoint file '{filename}' not found or is empty, initializing new checkpoint.")
    else:
        logger.info(f"Checkpoint restored from {filename}.")
    return restored_docs

def save_checkpoint(filename: Optional[str], manager: DocumentManager) -> None:
    """
    Saves the current state of the DocumentManager to a checkpoint file.

    Saving is best-effort: when no filename is given, the target directory
    does not exist, or the file cannot be written for lack of permissions,
    the problem is logged as an error and the function returns normally.
    Any other exception propagates to the caller.

    Args:
        filename (str, optional): The path of the checkpoint file to write.
        manager (DocumentManager): The DocumentManager instance to save.
    """
    # open(None) would raise TypeError; report it the same way as a bad path
    if filename is None:
        logger.error("Error saving checkpoint. No checkpoint file given.")
        return

    try:
        manager.persist_to_file(filename=filename)
    except (FileNotFoundError, PermissionError):
        # PermissionError is included because the message below tells the user
        # to check permissions
        logger.error("Error saving checkpoint. Check the directory path and permissions.")
    else:
        logger.info("Checkpoint saved.")

Overwriting checkpoint/main.py


## Logging

In [23]:
!mkdir logging_setup && touch logging_setup/__init__.py

In [24]:
%%writefile logging_setup/main.py

from pathlib import Path
import logging
from logging.handlers import TimedRotatingFileHandler


def setting_logging(log_path: str, log_level: str):
    """
    Configures the root logger to write to the console and to a daily-rotated file.

    The log file is `<log_path>/application.log`, where a relative `log_path`
    is resolved against the current working directory; the directory is
    created when missing. The file rolls over at midnight and old files are
    never deleted.

    Any handlers already attached to the root logger are removed and closed
    first (`force=True`). Without that, `logging.basicConfig` silently does
    nothing when logging was configured earlier in the process, and the file
    handler created here would never be used.

    Args:
        log_path (str): Directory for the log file, relative to the current
            working directory or absolute.
        log_level (str): Level name such as "INFO" or "debug" (case-insensitive),
            or a numeric logging level.

    Returns:
        logging.Logger: This module's logger, after a test message was logged.

    Raises:
        ValueError: If `log_level` is not a known level name.
    """
    # Ensure the log directory exists
    log_directory = Path.cwd() / log_path
    log_directory.mkdir(parents=True, exist_ok=True)

    # Path for the log file
    log_file_path = log_directory / "application.log"

    # logging only knows upper-case level names; accept "info" as well as "INFO"
    if isinstance(log_level, str):
        log_level = log_level.upper()

    # Set up TimedRotatingFileHandler to rotate logs every day
    file_handler = TimedRotatingFileHandler(
        log_file_path,
        when="midnight",
        interval=1,
        backupCount=0,  # Rotate every midnight, keep all backups
        encoding="utf-8",
    )

    # Set the file handler's log format
    file_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
    )

    # Set up logging configuration
    logging.basicConfig(
        level=log_level,  # Set to the desired log level
        format="%(asctime)s - %(levelname)s - %(message)s",  # Console log format
        datefmt="%Y-%m-%d %H:%M:%S",  # Custom date format
        handlers=[
            file_handler,  # Log to the rotating file
            logging.StreamHandler(),  # Log to console
        ],
        force=True,  # Replace handlers installed by earlier basicConfig calls (Python 3.8+)
    )

    # Example logger
    logger = logging.getLogger(__name__)

    # Log a test message to verify
    logger.info("Logging is set up with daily rotation.")

    return logger

Writing logging_setup/main.py
