In [None]:
import logging
import pathlib
import sys

from pandas import DataFrame

from topollm.analysis.compare_sampling_methods.load_and_concatenate_saved_dataframes import (
    load_and_concatenate_saved_dataframes,
)
from topollm.config_classes.constants import TOPO_LLM_REPOSITORY_BASE_PATH
from topollm.typing.enums import Verbosity

# Notebook-wide logging setup: records from DEBUG upwards are written to stdout.

# Layout of a single log line: timestamp, padded level, logger name, message, call site.
formatter = logging.Formatter(
    fmt="[%(asctime)s][%(levelname)8s][%(name)s] %(message)s (%(filename)s:%(lineno)s)",
)

# Handler that emits formatted records on stdout, so they appear in the cell output.
stream_handler = logging.StreamHandler(stream=sys.stdout)
stream_handler.setFormatter(fmt=formatter)
stream_handler.setLevel(level=logging.DEBUG)

# Logger named after this module; it lets DEBUG records through to its handlers.
default_logger: logging.Logger = logging.getLogger(name=__name__)
default_logger.setLevel(level=logging.DEBUG)

# `logging.getLogger` hands back the same object when this cell is run again,
# so only attach the handler while the logger has none (prevents duplicated lines).
if len(default_logger.handlers) == 0:
    default_logger.addHandler(hdlr=stream_handler)

# Names used by the remaining cells.
logger: logging.Logger = default_logger
verbosity: Verbosity = Verbosity.NORMAL

# Emit one record per level to confirm that the configuration works.
logger.debug(msg="This is a debug message.")
logger.info(msg="This is an info message.")

In [None]:
# Root folder of the saved comparison results, located below the repository base path.
comparisons_folder_base_path = pathlib.Path(
    TOPO_LLM_REPOSITORY_BASE_PATH,
    "data/analysis/sample_sizes/",
    "run_general_comparisons/",
    "array_truncation_size=5000/",
    "analysis/twonn/",
)

# Load everything below the root folder into a single frame.
# NOTE(review): how files are discovered and concatenated is decided by the helper - confirm there.
concatenated_df: DataFrame = load_and_concatenate_saved_dataframes(
    root_dir=comparisons_folder_base_path,
)

# Columns whose distinct values are logged below as a quick sanity check.
columns_to_investigate: list[str] = [
    "data_full",
    "data_subsampling_full",
    "model_partial_name",
]

for column_name in columns_to_investigate:
    logger.info(msg=30 * "=")
    # `!r` renders only the quoted column name, e.g. 'data_full'.
    # Wrapping the `{name = }` specifier in quotes would instead produce
    # 'column_name = 'data_full'' (nested quotes plus the variable name).
    logger.info(
        msg=f"Unique values in column {column_name!r}:",  # noqa: G004 - low overhead
    )
    logger.info(
        msg=concatenated_df[column_name].unique(),
    )

# Summary of the frame (columns, dtypes, non-null counts), printed to the cell output.
concatenated_df.info()

In [None]:
# Show the distinct values of the "model_full" column as this cell's output.
concatenated_df["model_full"].unique()

In [None]:
# Display the concatenated frame as this cell's output for visual inspection.
concatenated_df

### You can run different analysis methods here