In [1]:
def build_uschema_from_testdata(df):
    """
    Build a U-Schema model from the ``omop`` column of a test dataset.

    Each ``omop`` value is split on ``-``; every token is stripped and
    lower-cased, and empty tokens are dropped. The first remaining token
    is the entity name, all following tokens are attribute names of that
    entity. Attributes are de-duplicated per entity and emitted in sorted
    order with the fixed type ``"string"``; entities keep the order in
    which they first appear in ``df``.

    Rows whose ``omop`` value is not a string (e.g. the None/NaN that an
    empty spreadsheet cell produces) or that contain no non-blank token
    are skipped instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``omop`` column; other columns are ignored.

    Returns
    -------
    dict
        ``{"uSchemaModel": {"entities": [...]}}`` with one ``EntityType``
        entry per distinct entity, each holding a single
        ``StructuralVariation`` that lists its attributes.

    Raises
    ------
    KeyError
        If ``df`` has no ``omop`` column.
    """
    entities = {}

    # Only the "omop" column is needed, so iterate it directly rather than
    # materialising a Series per row via iterrows().
    for omop in df["omop"]:
        # Missing cells arrive as None/NaN, which have no .split().
        if not isinstance(omop, str):
            continue

        # Split into normalised, non-empty tokens.
        parts = [p.strip().lower() for p in omop.split("-") if p.strip()]

        # Blank or separator-only values yield no entity name at all.
        if not parts:
            continue

        entity, *attrs = parts  # attrs may be empty or hold several names

        # The set keeps attributes unique per entity; an entity without
        # attributes is still registered.
        entities.setdefault(entity, set()).update(attrs)

    # --- build final U-Schema ---
    uschema_entities = [
        {
            "EntityType": {
                "name": entity,
                "variations": [
                    {
                        "StructuralVariation": {
                            "properties": [
                                {
                                    "Attribute": [
                                        {"name": attr, "type": "string"}
                                        for attr in sorted(attrs)
                                    ]
                                }
                            ]
                        }
                    }
                ]
            }
        }
        for entity, attrs in entities.items()
    ]

    return {
        "uSchemaModel": {
            "entities": uschema_entities
        }
    }


In [2]:
import pandas as pd
# Read the source spreadsheet into a DataFrame.
df = pd.read_excel(io="omop_mimic_data.xlsx")

# Turn the loaded rows into the U-Schema dictionary used by later cells.
uschema_json = build_uschema_from_testdata(df=df)



In [3]:
# Bare last expression: the notebook renders the generated U-Schema dict as this cell's output.
uschema_json

{'uSchemaModel': {'entities': [{'EntityType': {'name': 'person',
     'variations': [{'StructuralVariation': {'properties': [{'Attribute': [{'name': 'birth_datetime',
            'type': 'string'},
           {'name': 'care_site_id', 'type': 'string'},
           {'name': 'day_of_birth', 'type': 'string'},
           {'name': 'death_datetime', 'type': 'string'},
           {'name': 'ethnicity_concept_id', 'type': 'string'},
           {'name': 'ethnicity_source_concept_id', 'type': 'string'},
           {'name': 'ethnicity_source_value', 'type': 'string'},
           {'name': 'gender_concept_id', 'type': 'string'},
           {'name': 'gender_source_concept_id', 'type': 'string'},
           {'name': 'gender_source_value', 'type': 'string'},
           {'name': 'location_id', 'type': 'string'},
           {'name': 'month_of_birth', 'type': 'string'},
           {'name': 'person_id', 'type': 'string'},
           {'name': 'person_source_value', 'type': 'string'},
           {'name': 'pr

In [4]:
import json

# Target file for the serialized U-Schema model.
output_path = "uschema_testdata.json"

# Write the model as indented UTF-8 JSON, keeping non-ASCII characters unescaped.
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(uschema_json, out_file, indent=2, ensure_ascii=False)

# Confirm where the file was written.
print(f"U-Schema saved to {output_path}")


U-Schema saved to uschema_testdata.json
