# Distance Table Component from GEF
Read the GEF file, then dump its parsed contents to inspect the raw dataset.

In [None]:
from pygef.cpt import CPTData
from evo.data_converters.gef.importer.parse_gef_files import parse_gef_files
from pprint import pprint

# Directory (relative path) holding the sample GEF input
test_data_dir = "detect-gef-file/data/input"

cpt_file = f"{test_data_dir}/cpt.gef"
# parse_gef_files is handed a one-element list of paths
result = parse_gef_files([cpt_file])
# Sanity checks: one file in, so expect a dict with exactly one CPTData value
assert isinstance(result, dict)
assert len(result) == 1
for v in result.values():
    assert isinstance(v, CPTData)
print("CPT DICT:")
pprint(result)

# Multiple GEF datasets, as a dict keyed by hole_id
parsed_cpt_files = result

In [None]:
from evo.notebooks import ServiceManagerWidget

# Log in through ServiceManagerWidget; the awaited result of login() is kept as
# `manager` and is later passed as service_manager_widget when building the clients.
# NOTE(review): the client id and both URLs are hard-coded, and the host names
# look like QA/UAT environments — confirm before running against anything else.
manager = await ServiceManagerWidget.with_auth_code(
    client_id="native-Aesb2WinpeP3ldYfhdSUfpc4x",
    base_uri="https://qa-ims.bentley.com",
    discovery_url="https://uat-api.test.seequent.systems",
).login()

In [None]:
from evo.data_converters.common import create_evo_object_service_and_data_client

# One call returns both clients; `data_client` is the one the component builders below use.
object_service_client, data_client = create_evo_object_service_and_data_client(service_manager_widget=manager)

In [None]:
from evo.objects.utils.data import ObjectDataClient
from evo_schemas.components import (
    DistanceTable_V1_2_0 as DistanceTable,
    DistanceTable_V1_2_0_Distance as Distance,
    CategoryAttribute_V1_1_0 as CategoryAttribute,
    ContinuousAttribute_V1_1_0 as ContinuousAttribute,
    NanCategorical_V1_0_1 as NanCategorical,
    NanContinuous_V1_0_1 as NanContinuous,
)
from evo_schemas.elements import (
    FloatArray1_V1_0_1 as FloatArray1,
    IntegerArray1_V1_0_1 as IntegerArray1,
    UnitLength_V1_0_1_UnitCategories as UnitLength_UnitCategories,
    LookupTable_V1_0_1 as LookupTable,
)
import pyarrow as pa
import pandas as pd

In [None]:
def create_category_lookup_and_data(column: "pd.Series | pd.DataFrame") -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Create a category lookup table and a data column with mapped values.

    Each distinct value in the input is assigned an integer key, starting at 1,
    in sorted order of the values.

    Args:
        column (pd.Series | pd.DataFrame): The categorical values, either as a
            Series of values or as a DataFrame holding them in a "data" column.

    Returns:
        lookup_df (pd.DataFrame): The category lookup table, with "key" and "value" columns.
        values_df (pd.DataFrame): A single "data" column holding the key of each input value.
    """
    # A Series has no "data" column to select, so use it as-is; indexing it with
    # ["data"] would be a label lookup on its index and fail.
    data = column if isinstance(column, pd.Series) else column["data"]

    categories = sorted(set(data))

    # Create lookup table: keys are 1-based positions in the sorted category list
    lookup_df = pd.DataFrame(
        {
            "key": list(range(1, len(categories) + 1)),
            "value": categories,
        }
    )

    # Create data column: replace every value by its key, keeping the input's index
    key_by_value = lookup_df.set_index("value")["key"]
    values_df = data.map(key_by_value).to_frame(name="data")
    return lookup_df, values_df

In [None]:
def _create_categorical_attribute_component(
    key: str, name: str, source_df: pd.DataFrame, data_client: ObjectDataClient
) -> CategoryAttribute:
    """
    Build a CategoryAttribute component from one column of a data frame.

    The column's distinct values are written as a key/value lookup table and the
    per-row keys as an integer array, each via ``data_client.save_table``.

    Args:
        key (str): Identifier stored as the attribute's ``key``.
        name (str): Column of ``source_df`` to read; also stored as the attribute's ``name``.
        source_df (pd.DataFrame): Frame holding the column.
        data_client (ObjectDataClient): Client whose ``save_table`` is called for each table.

    Returns:
        CategoryAttribute: The attribute referencing the saved lookup table and values.
    """
    lookup_df, values_df = create_category_lookup_and_data(source_df[name])

    # Lookup table: integer key -> category label
    lookup_schema = pa.schema([("key", pa.int64()), ("value", pa.string())])
    lookup_arrow = pa.Table.from_pandas(lookup_df, schema=lookup_schema)
    lookup_table_go = LookupTable.from_dict(data_client.save_table(lookup_arrow))

    # Per-row keys into the lookup table
    values_schema = pa.schema([("data", pa.int64())])
    values_arrow = pa.Table.from_pandas(values_df, schema=values_schema)
    int_array_go = IntegerArray1.from_dict(data_client.save_table(values_arrow))

    return CategoryAttribute(
        # Use the caller-supplied key rather than reusing the column name
        key=key,
        name=name,
        nan_description=NanCategorical(values=[]),
        table=lookup_table_go,
        values=int_array_go,
    )

In [None]:
def _create_continuous_attribute_component(
    key: str, name: str, source_df: pd.DataFrame, data_client: ObjectDataClient
) -> ContinuousAttribute:
    """
    Build a ContinuousAttribute component from one column of a data frame.

    The column's values are written as a single float64 "data" column via
    ``data_client.save_table``.

    Args:
        key (str): Identifier stored as the attribute's ``key``.
        name (str): Column of ``source_df`` to read; also stored as the attribute's ``name``.
        source_df (pd.DataFrame): Frame holding the column.
        data_client (ObjectDataClient): Client whose ``save_table`` is called for the values table.

    Returns:
        ContinuousAttribute: The attribute referencing the saved float array.
    """
    # Build the frame from the raw values. Handing a named pandas Series to
    # pd.DataFrame(..., columns=["data"]) selects by the Series' name, so any
    # column not literally called "data" would come out as all-NaN.
    values_df = pd.DataFrame({"data": list(source_df[name])})

    schema = pa.schema([("data", pa.float64())])
    table = pa.Table.from_pandas(values_df, schema=schema)
    float_array_go = FloatArray1.from_dict(data_client.save_table(table))

    return ContinuousAttribute(
        # Use the caller-supplied key rather than reusing the column name
        key=key,
        name=name,
        nan_description=NanContinuous(values=[]),
        values=float_array_go,
    )

In [None]:
from pygef.cpt import CPTData


def _create_dc_distances_table(hole_id, cpt: CPTData, data_client: ObjectDataClient) -> DistanceTable | None:
    """
    Build a DistanceTable component for one hole from its CPT data.

    The "penetrationLength" column of ``cpt.data`` is saved as the distance values.
    Every other column, except a fixed ignore list, becomes an attribute:
    categorical for "SBT", "SBTn" and "cluster", continuous otherwise.

    Args:
        hole_id: Used as the name of the returned DistanceTable.
        cpt (CPTData): Parsed CPT whose ``data`` table supplies the columns.
        data_client (ObjectDataClient): Client whose ``save_table`` is called for each table.

    Returns:
        DistanceTable: The distance table, with unit "m" and the collected attributes.
    """
    distances_schema = pa.schema([pa.field("values", pa.float64())])
    distances_table = pa.Table.from_arrays(
        [pa.array(cpt.data["penetrationLength"], type=pa.float64())], schema=distances_schema
    )
    distances_args = data_client.save_table(distances_table)
    distances_go = FloatArray1.from_dict(distances_args)

    PROPERTIES_TO_IGNORE = [
        "penetrationLength",
        "From",
        "To",
        "hole_ID",
    ]
    CATEGORICAL_PROPERTIES = [
        "SBT",
        "SBTn",
        "cluster",
    ]

    # Distance table attributes
    attributes = []

    for column_name in cpt.data.columns:
        if column_name in PROPERTIES_TO_IGNORE:
            continue

        if column_name in CATEGORICAL_PROPERTIES:
            try:
                attribute_go = _create_categorical_attribute_component(
                    column_name,
                    column_name,
                    cpt.data,
                    data_client,
                )
            except pd.errors.InvalidIndexError:
                print(f"Could not create {column_name} categorical attribute component.")
                # Nothing was built for this column, so move on instead of
                # reaching the append below with no (or a previous) attribute.
                continue
        else:
            attribute_go = _create_continuous_attribute_component(
                column_name,
                column_name,
                cpt.data,
                data_client,
            )

        if isinstance(attribute_go, (ContinuousAttribute, CategoryAttribute)):
            attributes.append(attribute_go)

    distances_unit = UnitLength_UnitCategories("m")

    distance_go = Distance(
        attributes=attributes,
        unit=distances_unit,
        values=distances_go,
    )

    return DistanceTable(name=hole_id, distance=distance_go)

List all the columns available for each hole in this GEF dataset

In [None]:
# Show the column names of every parsed hole
for hole_id in parsed_cpt_files:
    cpt_data = parsed_cpt_files[hole_id]
    pprint(cpt_data.columns)

Create distance table components for all holes in this GEF dataset

In [None]:
# Build and display one distance table per parsed hole
for hole_id in parsed_cpt_files:
    cpt_data = parsed_cpt_files[hole_id]
    distance_table = _create_dc_distances_table(hole_id, cpt_data, data_client)
    pprint(distance_table)

TODO:
1) Adjust so we process multiple GEF files
2) Finish with top-level 'collections' (DownholeAttributes) component, which contains the 'holes' (HoleChunks) data.