# Dataset Generation
Generate erroneous datasets.

In [1]:
import numpy as np
import pandas as pd
import error_generation
from utils import get_differences

In [2]:
from error_generation.api import mid_level
from error_generation.utils import ErrorModel, ErrorTypeConfig, MidLevelConfig
from error_generation.error_mechanism import EAR, ENAR, ECAR
from error_generation.utils import ErrorTypeConfig
from error_generation import error_type

In [3]:
def read_csv_dataset(dataset_path):
    """
    Load the CSV file at ``dataset_path`` into a DataFrame of raw strings.

    All columns are read with ``dtype=str`` and pandas' default NA markers are
    switched off, so cells such as ``NA`` or empty fields stay literal text.
    """
    read_options = {
        "sep": ",",
        "header": "infer",
        "encoding": "utf-8",
        "dtype": str,
        "keep_default_na": False,
        "low_memory": False,
    }
    return pd.read_csv(dataset_path, **read_options)

## Hospital

In [4]:
# Read each hospital file once. read_csv_dataset already loads every column
# with dtype=str, so re-reading the files and calling astype(str) added nothing.
df_clean = read_csv_dataset('../data/hospital/clean.csv')
df_dirty = read_csv_dataset('../data/hospital/dirty.csv')

# Independent string-typed snapshots: later dtype changes on df_clean / df_dirty
# do not propagate to these copies.
df_clean_str = df_clean.copy()
df_dirty_str = df_dirty.copy()

Rename the `StateAverage` column to `Stateavg` and drop the `index` column so that the HOSP dataset from Baran matches the version of the dataset used by Xu et al.

In [5]:
# Apply the same two adjustments to all four frames: rename 'StateAverage'
# to 'Stateavg', then remove the 'index' column.
df_clean = df_clean.rename(columns={'StateAverage': 'Stateavg'}).drop(columns='index')
df_dirty = df_dirty.rename(columns={'StateAverage': 'Stateavg'}).drop(columns='index')
df_clean_str = df_clean_str.rename(columns={'StateAverage': 'Stateavg'}).drop(columns='index')
df_dirty_str = df_dirty_str.rename(columns={'StateAverage': 'Stateavg'}).drop(columns='index')

In [6]:
def find_replacement_rules(df, column):
    """
    Count which clean characters were overwritten by 'x' in the dirty data.

    For every row the clean and dirty strings are compared position by
    position. The first position where the dirty value holds 'x' while the
    clean value holds a different character adds one count for that clean
    character; at most one replacement is counted per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``f'{column}_clean'`` and
        ``f'{column}_dirty'``, both holding strings.
    column : str
        Base name used to build the two column labels above.

    Returns
    -------
    collections.Counter
        Maps each replaced clean character to the number of rows in which it
        was the first character replaced by 'x'.
    """
    # Imported locally so the function works even if the cell that imports
    # Counter at notebook level has not been executed yet.
    from collections import Counter

    replacement_rules = Counter()

    for clean, dirty in zip(df[f'{column}_clean'], df[f'{column}_dirty']):
        # zip stops at the shorter string, so rows without any 'x' replacement
        # (or with a shorter dirty value) are skipped instead of running one
        # index past the end and raising IndexError.
        for clean_char, dirty_char in zip(clean, dirty):
            if dirty_char == 'x' and clean_char != 'x':
                replacement_rules[clean_char] += 1
                break  # only the first replacement in a row is counted

    return replacement_rules

In [7]:
from collections import Counter

# For every column, store the Counter that find_replacement_rules derives from
# the frame get_differences returns for the clean/dirty string data.
rules = {}

for column in df_clean.columns:
    rules[column] = find_replacement_rules(
        get_differences(df_clean_str, df_dirty_str, column),
        column,
    )

**Realistic dtypes**
The first step to generating realistic errors is assuming realistic dtypes.

In [8]:
# Cast every column of the clean frame to its assumed dtype in a single call.
df_clean = df_clean.astype({
    'ProviderNumber': 'int',
    'HospitalName': 'str',
    'Address1': 'str',
    'Address2': 'str',
    'Address3': 'str',
    'City': 'str',
    'State': 'category',
    'ZipCode': 'category',
    'CountyName': 'str',
    'PhoneNumber': 'int',
    'HospitalType': 'category',
    'HospitalOwner': 'category',
    'EmergencyService': 'category',
    'Condition': 'category',
    'MeasureCode': 'category',
    'MeasureName': 'str',
    'Score': 'str',
    'Sample': 'str',
    'Stateavg': 'str',
})

## Replica

In [9]:
# One Replace model per (column, replaced character) pair; each model's rate is
# that character's replacement count divided by the number of rows.
n_rows = df_clean.shape[0]
mid_lvl_config = {
    column: [
        ErrorModel(
            ECAR(),
            error_type.Replace({'replace_what': char, 'replace_with': 'x'}),
            count / n_rows,
        )
        for char, count in column_rules.items()
    ]
    for column, column_rules in rules.items()
}

In [10]:
# Wrap the per-column error models in the config object that is later passed
# to mid_level.create_errors.
config = MidLevelConfig(mid_lvl_config)

In [11]:
# Produce ten corrupted versions from the same config and write each to disk.
for i in range(10):
    df_corrupted, error_mask = mid_level.create_errors(df_clean_str, config)
    df_corrupted.to_csv(
        path_or_buf=f'../export_data/hospital/hospital_replica_{i}.csv',
        index=False,
    )

# Store the unmodified string frame next to the corrupted versions.
df_clean_str.to_csv(
    path_or_buf='../export_data/hospital/hospital_replica_clean.csv',
    index=False,
)

### Realistic Error Scenarios

### 1) Realistic Errors Completely At Random in HOSP

In [12]:
# Per-column share (between 0 and 1) of cells where dirty and clean differ.
error_percentages = (df_dirty_str != df_clean_str).sum() / len(df_dirty_str)

# Every column gets one Butterfinger model, applied completely at random at the
# rate observed for that column.
mid_lvl_config_butter_ecar = {
    column: [
        ErrorModel(
            ECAR(),
            error_type.Butterfinger(),
            float(error_percentages[column]),
        )
    ]
    for column in df_clean_str.columns
}
config_butter_ecar = MidLevelConfig(mid_lvl_config_butter_ecar)

In [13]:
# Produce ten Butterfinger/ECAR versions from the same config and write each one.
for i in range(10):
    df_corrupted, error_mask = mid_level.create_errors(df_clean_str, config_butter_ecar)
    df_corrupted.to_csv(
        path_or_buf=f'../export_data/hospital/hospital_butter_ecar_{i}.csv',
        index=False,
    )

# Store the unmodified string frame next to the corrupted versions.
df_clean_str.to_csv(
    path_or_buf='../export_data/hospital/hospital_butter_ecar_clean.csv',
    index=False,
)

### 2) Realistic Erroneous At Random in HOSP

In [14]:
for i in range(10):
    # A stride of 1000 per run keeps the three seeds of one run distinct from
    # the seeds of every other run.
    base_seed = 1000 * i
    seeds = [base_seed + offset for offset in (1, 2, 3)]

    # All three columns are corrupted conditioned on HospitalOwner, each with
    # its own seed and the same rate.
    mid_lvl_config_ear_realistic = {
        'Address1': [
            ErrorModel(
                EAR(condition_to_column='HospitalOwner', seed=seeds[0]),
                error_type.Butterfinger(),
                0.1696666,
            )
        ],
        'ZipCode': [
            ErrorModel(
                EAR(condition_to_column='HospitalOwner', seed=seeds[1]),
                error_type.Mislabel(),
                0.1696666,
            )
        ],
        'State': [
            ErrorModel(
                EAR(condition_to_column='HospitalOwner', seed=seeds[2]),
                error_type.Mislabel(),
                0.1696666,
            )
        ],
    }
    ear_realistic = MidLevelConfig(mid_lvl_config_ear_realistic)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, ear_realistic)
    df_corrupted.to_csv(f'../export_data/hospital/hospital_ear_realistic_{i}.csv', index=False)

# The clean reference file is written from the string frame, not from df_clean.
df_clean_str.to_csv('../export_data/hospital/hospital_ear_realistic_clean.csv', index=False)

In [15]:
# Total number of cells in which df_corrupted differs from df_clean.
# NOTE(review): df_corrupted is whatever the most recently executed generation
# cell left behind (the last EAR run when cells are executed in order).
(df_corrupted != df_clean).sum().sum()

np.int64(507)

### 3) Realistic Erroneous Not At Random in HOSP

In [16]:
def _suffixed_to_nullable_int(series, suffix):
    """Convert strings like '85<suffix>' to nullable Int64; 'empty' becomes <NA>."""
    def convert(value):
        if value == 'empty':
            return pd.NA
        return int(value.split(suffix)[0])

    return series.apply(convert).astype('Int64')


def parse_numbers(df):
    """
    Parse the string columns 'Score' and 'Sample' into nullable integers.

    'Score' values look like '85%' and 'Sample' values like '12 patients';
    the literal 'empty' marks a missing value and is mapped to <NA>.

    Missing values are handled explicitly rather than through a -99 sentinel,
    so a genuine -99 is preserved and the result does not depend on how the
    installed pandas version interprets ``replace(..., None)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns 'Score' and 'Sample'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose 'Score' and 'Sample' columns have dtype Int64.
    """
    parsed = df.copy()
    parsed['Score'] = _suffixed_to_nullable_int(parsed['Score'], '%')
    parsed['Sample'] = _suffixed_to_nullable_int(parsed['Sample'], ' patients')
    return parsed

def serialize_strings(df):
    """
    Turn the numeric 'Score' and 'Sample' columns back into strings.

    Numbers are truncated to int and rendered as '<n>%' for 'Score' and
    '<n> patients' for 'Sample'; missing values become the literal 'empty'.

    Missing values are detected with ``pd.isna``, which accepts float NaN,
    None and ``pd.NA`` alike. ``np.isnan(pd.NA)`` yields ``pd.NA``, whose truth
    value is ambiguous, so the earlier check failed on nullable Int64 columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric 'Score' and 'Sample' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with 'Score' and 'Sample' serialized to strings.
    """
    serialized = df.copy()
    serialized['Score'] = serialized['Score'].apply(
        lambda x: 'empty' if pd.isna(x) else f'{int(x)}%'
    )
    serialized['Sample'] = serialized['Sample'].apply(
        lambda x: 'empty' if pd.isna(x) else f'{int(x)} patients'
    )
    return serialized

In [17]:
# Work on a separate copy so that df_clean itself keeps its current columns.
df_clean_realistic_assumptions = df_clean.copy()

In [18]:
# Convert the 'Score' and 'Sample' columns from strings to integers.
df_clean_realistic_assumptions = parse_numbers(df_clean_realistic_assumptions)

In [19]:
for i in range(10):
    # A stride of 1000 per run keeps the three seeds of one run distinct from
    # the seeds of every other run.
    base_seed = i * 1000
    seeds = [base_seed + 1, base_seed + 2, base_seed + 3]

    # Outliers in the two numeric columns (ENAR) plus mislabelled MeasureCode
    # values conditioned on Condition (EAR), all at the same rate.
    mid_lvl_config_enar = {
        'Score': [ErrorModel(ENAR(seed=seeds[0]), error_type.Outlier({'outlier_coefficient': 0.2}), 0.1696666)],
        'Sample': [ErrorModel(ENAR(seed=seeds[1]), error_type.Outlier({'outlier_coefficient': 0.1}), 0.1696666)],
        'MeasureCode': [ErrorModel(EAR(condition_to_column='Condition', seed=seeds[2]), error_type.Mislabel(), 0.1696666)],
    }
    enar_realistic = MidLevelConfig(mid_lvl_config_enar)
    df_corrupted, error_mask = mid_level.create_errors(df_clean_realistic_assumptions, enar_realistic)

    # serialize numbers when saving
    serialize_strings(df_corrupted).to_csv(f'../export_data/hospital/hospital_enar_realistic_{i}.csv', index=False)

# Serialize numbers when saving. A copy is handed over so the parsed clean
# frame keeps its integer columns however serialize_strings treats its
# argument; this keeps the cell re-runnable.
serialize_strings(df_clean_realistic_assumptions.copy()).to_csv('../export_data/hospital/hospital_enar_realistic_clean.csv', index=False)