This notebook generates perturbed (error-injected) datasets from the clean hospital (`hosp`) dataset using `tab_err`.

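The main outputs are the three datasets below (each section also writes the matching error mask as `../output/*_typo_mask.csv`, and the final section writes `../output/mid_level_HOSP.csv` with its mask):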
- ../output/ecar_typo_hosp_tab_err.csv
- ../output/enar_typo_hosp_tab_err.csv
- ../output/ear_typo_hosp_tab_err.csv

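The target error rate is chosen to match the BART error rate of 0.172: `error_rate` is set to 0.18, which yields an actual error rate of about 0.171 in the runs below. This value could change.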

The `Typo` error type is similar to the "add 3 `*`s" strategy from BART's "dirty strategies".

In [20]:
import pandas as pd

from tab_err import error_type
from tab_err import error_mechanism

from tab_err.api import high_level

General Setup

In [21]:
# Load the clean hospital table; the path is relative to this notebook's directory.
clean_df = pd.read_csv("../../../data/hospital/clean.csv")

# Cast every column after the first (selected by position, not by name) to str;
# the first column keeps the dtype pandas inferred on load.
cols_to_convert = clean_df.columns[1:]
clean_df = clean_df.astype({column: str for column in cols_to_convert})

# Shared settings reused by the ECAR / ENAR / EAR cells below.
seed = 1234
error_rate = 0.18  # Adjust as necessary
error_types_to_include = [error_type.Typo()]

ECAR

In [22]:
# Keep only ECAR by excluding the other two error mechanisms.
error_mechanisms_to_exclude = [error_mechanism.EAR(), error_mechanism.ENAR()]

# create_errors returns a pair: the perturbed frame and its error mask.
ecar_df, ecar_mask = high_level.create_errors(
    clean_df,
    seed=seed,
    error_rate=error_rate,
    error_types_to_include=error_types_to_include,
    error_mechanisms_to_exclude=error_mechanisms_to_exclude,
)

# Mean of the mask over all cells, reported as the realised error rate.
ecar_actual_rate = ecar_mask.mean(axis=None)
print(f"Actual error rate: {ecar_actual_rate}")

# Persist the perturbed data first, then the mask.
ecar_df.to_csv("../output/ecar_typo_hosp_tab_err.csv")
ecar_mask.to_csv("../output/ecar_typo_mask.csv")

Actual error rate: 0.171


  col_num_models = _build_column_number_of_models_dictionary(data=data, column_types=col_type, column_mechanisms=col_mechanisms)


ENAR

In [23]:
# Keep only ENAR by excluding the other two error mechanisms.
error_mechanisms_to_exclude = [error_mechanism.EAR(), error_mechanism.ECAR()]

# create_errors returns a pair: the perturbed frame and its error mask.
enar_df, enar_mask = high_level.create_errors(
    clean_df,
    seed=seed,
    error_rate=error_rate,
    error_types_to_include=error_types_to_include,
    error_mechanisms_to_exclude=error_mechanisms_to_exclude,
)

# Mean of the mask over all cells, reported as the realised error rate.
enar_actual_rate = enar_mask.mean(axis=None)
print(f"Actual error rate: {enar_actual_rate}")

# Persist the perturbed data first, then the mask.
enar_df.to_csv("../output/enar_typo_hosp_tab_err.csv")
enar_mask.to_csv("../output/enar_typo_mask.csv")

Actual error rate: 0.171


  col_num_models = _build_column_number_of_models_dictionary(data=data, column_types=col_type, column_mechanisms=col_mechanisms)


EAR

In [24]:
# Keep only EAR by excluding the other two error mechanisms.
error_mechanisms_to_exclude = [error_mechanism.ECAR(), error_mechanism.ENAR()]

# create_errors returns a pair: the perturbed frame and its error mask.
ear_df, ear_mask = high_level.create_errors(
    clean_df,
    seed=seed,
    error_rate=error_rate,
    error_types_to_include=error_types_to_include,
    error_mechanisms_to_exclude=error_mechanisms_to_exclude,
)

# Mean of the mask over all cells, reported as the realised error rate.
ear_actual_rate = ear_mask.mean(axis=None)
print(f"Actual error rate: {ear_actual_rate}")

# Persist the perturbed data first, then the mask.
ear_df.to_csv("../output/ear_typo_hosp_tab_err.csv")
ear_mask.to_csv("../output/ear_typo_mask.csv")

  col_num_models = _build_column_number_of_models_dictionary(data=data, column_types=col_type, column_mechanisms=col_mechanisms)


Actual error rate: 0.171


ECAR with midlevel config

Per-column error proportions from BART run 1 (the non-zero values are used as the per-column error rates in the config below):
index               0.000
providernumber      0.000
hospitalname        0.188
address1            0.275
address2            0.000
address3            0.000
city                0.311
state               0.759
zipcode             0.103
countyname          0.529
phonenumber         0.201
hospitaltype        0.000
hospitalowner       0.112
emergencyservice    0.640
condition           0.300
measurecode         0.030
measurename         0.100
score               0.000
sample              0.000
stateaverage        0.069

In [25]:
# Preview the first two rows to check the column names used in the per-column config.
clean_df.head(n=2)

Unnamed: 0,index,ProviderNumber,HospitalName,Address1,Address2,Address3,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score,Sample,StateAverage
0,1,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs c...,empty,empty,al_scip-card-2
1,2,10018,callahan eye foundation hospital,1720 university blvd,empty,empty,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic ...,empty,empty,al_scip-inf-1


In [26]:
from tab_err import ErrorModel, error_mechanism, error_type
from tab_err.api import MidLevelConfig, mid_level

# Per-column typo rates taken from the "BART run 1 error proportions" table above.
# Columns whose proportion is 0.000 are left out, so they receive no errors.
column_error_rates = {
    "HospitalName": 0.188,
    "Address1": 0.275,
    "City": 0.311,
    "State": 0.759,
    "ZipCode": 0.103,
    "CountyName": 0.529,
    "PhoneNumber": 0.201,
    "HospitalOwner": 0.112,
    "EmergencyService": 0.640,
    "Condition": 0.300,
    "MeasureCode": 0.030,
    "MeasureName": 0.100,
    "StateAverage": 0.069,
}

# Fail early with a readable message if a configured column is not in the data.
missing_columns = sorted(set(column_error_rates) - set(clean_df.columns))
assert not missing_columns, f"Columns missing from clean_df: {missing_columns}"

# One ECAR + Typo error model per column; every column gets its own fresh
# mechanism and error-type instances, exactly as in the hand-written config.
config = MidLevelConfig(
    {
        column: [
            ErrorModel(
                error_mechanism=error_mechanism.ECAR(),
                error_type=error_type.Typo(),
                error_rate=rate,
            )
        ]
        for column, rate in column_error_rates.items()
    }
)

# Pass the shared seed so this run is reproducible, consistent with the
# high-level ECAR / ENAR / EAR cells above.
# NOTE(review): confirm the installed tab_err version accepts `seed` here.
corrupted_data, m = mid_level.create_errors(data=clean_df, config=config, seed=seed)

print(f"Actual error rate: {m.mean(axis=None)}")

corrupted_data.to_csv("../output/mid_level_HOSP.csv")
m.to_csv("../output/mid_level_HOSP_mask.csv")

Actual error rate: 0.18085
