In [1]:
import sys

import pandas as pd

import harmonization_framework.replay_log as rlog
from harmonization_framework.rule import HarmonizationRule
from harmonization_framework.primitives import Cast, Bin, ConvertUnits, EnumToEnum, Round
from harmonization_framework.rule_store import RuleStore
from harmonization_framework.utils import transformations

### Step 1: Set up logger

In [2]:
# Configure the logger that records harmonization calls to "demo.log";
# the same log file is replayed later in this notebook.
# NOTE(review): the first argument (3) is presumably a verbosity/log level --
# confirm against replay_log.configure_logger.
logger = rlog.configure_logger(3, "demo.log")

### Step 2: Load and examine data

In [3]:
# Source 1: use the first CSV column as the index, parse the two columns this
# demo harmonizes explicitly, and discard the columns the demo does not use.
df_source1 = pd.read_csv(
    "demo_source1.csv",
    index_col=0,
    converters={"current_employment_status": int, "commute_distance_miles": float},
).drop(columns=["edu_years_of_school", "zip_code_9"])

# Source 2: same layout, but with its own column names and distances in km.
df_source2 = pd.read_csv(
    "demo_source2.csv",
    index_col=0,
    converters={"employment": int, "commute_distance_km": float},
).drop(columns=["edu_years_of_school", "zip_code_9"])

In [4]:
# Display source 1 after the unused columns were dropped in the loading cell.
df_source1

Unnamed: 0_level_0,current_employment_status,commute_distance_miles
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,4.7
1,2,6.69
2,1,8.44
3,3,0.0
4,1,4.62


In [5]:
# Display source 2 after the unused columns were dropped in the loading cell.
df_source2

Unnamed: 0_level_0,employment,commute_distance_km
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,17.73
1,2,4.45
2,2,9.12
3,3,0.0
4,3,0.0


### Step 3: Define harmonization rules to suit use case.

In [6]:
# spin up a data store for our harmonization rules;
# rules are registered with rules.add_rule(...) and the store is later passed
# to transformations.harmonize_dataset(...)
rules = RuleStore()

In [7]:
# UP dataset: re-code `current_employment_status` values to the Data Hub
# `nih_employment` data element (keys are source codes, values are target codes).
up_employment_codes = {
    1: 0,
    2: 97,
    3: 1,
    4: 4,
    5: 5,
    6: 97,
    7: 2,
    96: 97,
    98: 99,
    99: 98,
}
employment_rule_up = HarmonizationRule(
    source="current_employment_status",
    target="nih_employment",
    transformation=[EnumToEnum(up_employment_codes)],
)
rules.add_rule(employment_rule_up)

# RAD dataset: re-code `employment` values to the same Data Hub
# `nih_employment` data element.
rad_employment_codes = {
    1: 0,
    2: 0,
    3: 1,
}
employment_rule_rad = HarmonizationRule(
    source="employment",
    target="nih_employment",
    transformation=[EnumToEnum(rad_employment_codes)],
)
rules.add_rule(employment_rule_rad)

In [8]:
# RAD dataset: convert commute distance from kilometers to miles, then round
# the result to 2 decimal places.
km_to_miles_steps = [
    ConvertUnits(source="kilometers", target="miles"),
    Round(precision=2),
]
dist_rule_rad = HarmonizationRule(
    source="commute_distance_km",
    target="commute_distance_miles",
    transformation=km_to_miles_steps,
)
rules.add_rule(dist_rule_rad)

### Step 4: Execute transformations and combine datasets

In [9]:
# (source column, target column) pairs to apply to each dataset; each pair
# matches the source/target of a rule registered in `rules` above.
transformations_up = [("current_employment_status", "nih_employment")]
transformations_rad = [
    ("employment", "nih_employment"),
    ("commute_distance_km", "commute_distance_miles"),
]

# Harmonize each source under its dataset name, passing the shared logger.
df_harmonized1 = transformations.harmonize_dataset(
    df_source1, transformations_up, rules, "up_dataset", logger
)
df_harmonized2 = transformations.harmonize_dataset(
    df_source2, transformations_rad, rules, "rad_dataset", logger
)

# Merge the harmonized frames into one integrated dataset.
integrated = transformations.combine_datasets([df_harmonized1, df_harmonized2])

Requested rule: current_employment_status -> nih_employment
Requested rule: employment -> nih_employment
Requested rule: commute_distance_km -> commute_distance_miles


In [10]:
# Display the combined result of harmonizing both sources.
integrated

Unnamed: 0_level_0,nih_employment,commute_distance_miles,source dataset,original_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,4.7,up_dataset,0
1,97,6.69,up_dataset,1
2,0,8.44,up_dataset,2
3,1,0.0,up_dataset,3
4,0,4.62,up_dataset,4
5,0,11.02,rad_dataset,0
6,0,2.77,rad_dataset,1
7,0,5.67,rad_dataset,2
8,1,0.0,rad_dataset,3
9,1,0.0,rad_dataset,4


### Example: save harmonization rules to disk

In [14]:
# Write the rule store to demo_rules.json.
# NOTE(review): the age rule is added to `rules` in a later cell; presumably the
# saved file only contains rules added before this cell runs -- re-run this cell
# after all add_rule calls if the file should include every rule.
rules.save("demo_rules.json")

### Example: replay harmonization from log

In [11]:
# Map the dataset names used during harmonization to their original source
# frames, then replay the logged harmonization from "demo.log".
replay_sources = {"up_dataset": df_source1, "rad_dataset": df_source2}
replay_results = transformations.replay("demo.log", replay_sources)

# Combine the replayed per-dataset results into one frame.
replay_integrated = transformations.combine_datasets(replay_results.values())

Requested rule: current_employment_status -> nih_employment
Requested rule: employment -> nih_employment
Requested rule: commute_distance_km -> commute_distance_miles


In [12]:
# Display the combined replay result for comparison with `integrated` above.
replay_integrated

Unnamed: 0_level_0,nih_employment,commute_distance_miles,source dataset,original_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,4.7,up_dataset,0
1,97,6.69,up_dataset,1
2,0,8.44,up_dataset,2
3,1,0.0,up_dataset,3
4,0,4.62,up_dataset,4
5,0,11.02,rad_dataset,0
6,0,2.77,rad_dataset,1
7,0,5.67,rad_dataset,2
8,1,0.0,rad_dataset,3
9,1,0.0,rad_dataset,4


### Example: harmonization rule using categorical bins

In [13]:
# Age given as text -> categorical age range.
# The text value is first cast to an integer, then assigned to a bin.
# Each Bin entry is [label, [lower, upper]]; the bounds are written as
# contiguous, non-overlapping ranges (0-30, 31-40, ..., 61-70, 71+), so every
# non-negative age matches exactly one label.
age_rule = HarmonizationRule(
    source="age_text",
    target="age_range",
    transformation=[
        Cast("text", "integer"),
        Bin([
            [0, [0, 30]],
            [1, [31, 40]],
            [2, [41, 50]],
            [3, [51, 60]],
            [4, [61, 70]],
            # open-ended top bin; starts at 71 so that age 70 belongs only to bin 4
            [5, [71, sys.maxsize]],
        ]),
    ],
)
rules.add_rule(age_rule)