In [84]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
import numpy as np
import pandas as pd
from collections import Counter

from utils import get_differences, read_csv_dataset, create_mcar

In [86]:
from tab_err import ErrorModel, error_type
from tab_err.error_mechanism import EAR, ENAR, ECAR
from tab_err.error_type import ErrorTypeConfig
from tab_err.api import mid_level, MidLevelConfig

## beers

#### Loading & Cleaning

In [131]:
# Read the clean reference and the dirty version of the beers data.
df_clean = read_csv_dataset('../data/beers/clean.csv')
df_dirty = read_csv_dataset('../data/beers/dirty.csv')

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# Columns that the scenario cell corrupts with CategorySwap are cast to categoricals.
for col_name in ('ounces', 'city', 'state'):
    df_clean[col_name] = df_clean[col_name].astype('category')

# Empty strings are replaced by NaN so the two columns can be cast to float64.
for col_name in ('abv', 'ibu'):
    df_clean[col_name] = df_clean[col_name].replace('', np.nan).astype('float64')

#### MCAR

In [132]:
# Write the MCAR variants for beers (the recorded output shows iterations 0-9 being saved).
# create_mcar is imported from the local utils module; presumably it injects errors
# completely at random at the per-column rates in error_percentages — confirm in utils.
create_mcar('beers', df_clean, error_percentages)

Saved MCAR dataset beers iteration 0
Saved MCAR dataset beers iteration 1
Saved MCAR dataset beers iteration 2
Saved MCAR dataset beers iteration 3
Saved MCAR dataset beers iteration 4
Saved MCAR dataset beers iteration 5
Saved MCAR dataset beers iteration 6
Saved MCAR dataset beers iteration 7
Saved MCAR dataset beers iteration 8
Saved MCAR dataset beers iteration 9


#### Scenario

In [133]:
scenario_name = 'beers_scenario'

# Export ten corrupted copies of the clean frame; iteration i uses seeds i*1000+1 .. i*1000+5.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = [base_seed + offset for offset in range(1, 6)]

    # (column, error mechanism, error type); the error rate is looked up per column below.
    column_specs = [
        # ECAR because error_percentage is 1.0 in 'ounces', mechanism doesn't matter
        ('ounces', ECAR(seed=seeds[0]), error_type.CategorySwap()),
        ('abv', EAR(condition_to_column='brewery-name', seed=seeds[1]), error_type.Outlier()),
        ('ibu', EAR(condition_to_column='brewery-name', seed=seeds[2]), error_type.Outlier()),
        ('city', EAR(condition_to_column='brewery-name', seed=seeds[3]), error_type.CategorySwap()),
        ('state', EAR(condition_to_column='brewery-name', seed=seeds[4]), error_type.CategorySwap()),
    ]
    mid_lvl_config = {
        column: [ErrorModel(mechanism, kind, error_percentages[column])]
        for column, mechanism, kind in column_specs
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/beers/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/beers/{scenario_name}_clean.csv', index=False)

## bridges

In [134]:
# Read the clean reference and the dirty version of the bridges data.
df_clean = read_csv_dataset('../data/bridges/clean.csv')
df_dirty = read_csv_dataset('../data/bridges/bridges_3_1.csv')

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# '?' is replaced by NaN before casting the two columns to the nullable Int64 dtype.
for col_name in ('ERECTED', 'LENGTH'):
    df_clean[col_name] = df_clean[col_name].replace('?', np.nan).astype('Int64')

# Remaining columns used by the scenario: categoricals, except CLEAR-G which stays text.
column_dtypes = {
    'LOCATION': 'category',
    'PURPOSE': 'category',
    'LANES': 'category',
    'CLEAR-G': 'string',
    'T-OR-D': 'category',
    'MATERIAL': 'category',
    'SPAN': 'category',
    'REL-L': 'category',
    'TYPE': 'category',
}
for col_name, target_dtype in column_dtypes.items():
    df_clean[col_name] = df_clean[col_name].astype(target_dtype)

# Write the MCAR variants (create_mcar comes from the local utils module).
create_mcar('bridges', df_clean, error_percentages)

Saved MCAR dataset bridges iteration 0
Saved MCAR dataset bridges iteration 1
Saved MCAR dataset bridges iteration 2
Saved MCAR dataset bridges iteration 3
Saved MCAR dataset bridges iteration 4
Saved MCAR dataset bridges iteration 5
Saved MCAR dataset bridges iteration 6
Saved MCAR dataset bridges iteration 7
Saved MCAR dataset bridges iteration 8
Saved MCAR dataset bridges iteration 9


In [135]:
scenario_name = 'bridges_scenario'

# Export ten corrupted copies of the clean frame, each with its own block of seeds.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = list(range(base_seed + 1, base_seed + len(df_clean.columns) + 1))

    # (column, error mechanism, error type); the error rate is looked up per column below.
    column_specs = [
        ("LOCATION", ENAR(seed=seeds[0]), error_type.CategorySwap()),
        ("ERECTED", EAR(condition_to_column='LOCATION', seed=seeds[1]), error_type.Outlier()),
        ("PURPOSE", EAR(condition_to_column='LOCATION', seed=seeds[2]), error_type.Extraneous({'extraneous_value_template': 'Purpose: {value}'})),
        ("LENGTH", ENAR(seed=seeds[3]), error_type.Outlier()),
        ("LANES", EAR(condition_to_column='PURPOSE', seed=seeds[4]), error_type.CategorySwap()),
        ("CLEAR-G", ENAR(seed=seeds[5]), error_type.MissingValue({'missing_value': 'N/A'})),
        ("T-OR-D", EAR(condition_to_column='PURPOSE', seed=seeds[6]), error_type.CategorySwap()),
        ("MATERIAL", EAR(condition_to_column='ERECTED', seed=seeds[7]), error_type.CategorySwap()),
        ("SPAN", EAR(condition_to_column='PURPOSE', seed=seeds[8]), error_type.CategorySwap()),
        ("REL-L", EAR(condition_to_column='PURPOSE', seed=seeds[9]), error_type.CategorySwap()),
        ("TYPE", EAR(condition_to_column='PURPOSE', seed=seeds[10]), error_type.CategorySwap()),
    ]
    mid_lvl_config = {
        column: [ErrorModel(mechanism, kind, error_percentages[column])]
        for column, mechanism, kind in column_specs
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/bridges/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/bridges/{scenario_name}_clean.csv', index=False)

## cars

In [137]:
# Read the clean reference and the dirty version of the cars data.
df_clean = read_csv_dataset('../data/cars/clean.csv')
df_dirty = read_csv_dataset('../data/cars/cars_3_1.csv')

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# Columns corrupted with CategorySwap become categoricals; Outlier columns become nullable floats.
column_dtypes = {
    'Cylinders': 'category',
    'Acceleration': 'category',
    'Model': 'category',
    'Origin': 'category',
    'Displacement': 'Float64',
    'Horsepower': 'Float64',
    'Weight': 'Float64',
}
for col_name, target_dtype in column_dtypes.items():
    df_clean[col_name] = df_clean[col_name].astype(target_dtype)

# Write the MCAR variants (create_mcar comes from the local utils module).
create_mcar('cars', df_clean, error_percentages)

Saved MCAR dataset cars iteration 0
Saved MCAR dataset cars iteration 1
Saved MCAR dataset cars iteration 2
Saved MCAR dataset cars iteration 3
Saved MCAR dataset cars iteration 4
Saved MCAR dataset cars iteration 5
Saved MCAR dataset cars iteration 6
Saved MCAR dataset cars iteration 7
Saved MCAR dataset cars iteration 8
Saved MCAR dataset cars iteration 9


In [138]:
# Preview the first rows of the cars frame after the dtype casts in the previous cell.
df_clean.head()

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,Chevrolet Chevelle Malibu,18.0,8,307.0,130.0,3504.0,12.0,70,US
1,Buick Skylark 320,15.0,8,350.0,165.0,3693.0,11.5,70,US
2,Plymouth Satellite,18.0,8,318.0,150.0,3436.0,11.0,70,US
3,AMC Rebel SST,16.0,8,304.0,150.0,3433.0,12.0,70,US
4,Ford Torino,17.0,8,302.0,140.0,3449.0,10.5,70,US


In [139]:
scenario_name = 'cars_scenario'

# Export ten corrupted copies of the clean frame, each with its own block of seeds.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = list(range(base_seed + 1, base_seed + len(df_clean.columns) + 1))

    # (column, error mechanism, error type); the error rate is looked up per column below.
    column_specs = [
        ("Cylinders", ENAR(seed=seeds[0]), error_type.CategorySwap()),
        ("Displacement", EAR(condition_to_column='Horsepower', seed=seeds[1]), error_type.Outlier()),
        ("Horsepower", ENAR(seed=seeds[2]), error_type.Outlier()),
        ("Weight", EAR(condition_to_column="Horsepower", seed=seeds[3]), error_type.Outlier()),
        ("Acceleration", EAR(condition_to_column="Horsepower", seed=seeds[4]), error_type.CategorySwap()),
        ("Model", EAR(condition_to_column="Origin", seed=seeds[5]), error_type.CategorySwap()),
        ("Origin", ENAR(seed=seeds[6]), error_type.CategorySwap()),
    ]
    mid_lvl_config = {
        column: [ErrorModel(mechanism, kind, error_percentages[column])]
        for column, mechanism, kind in column_specs
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/cars/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/cars/{scenario_name}_clean.csv', index=False)

## flights

In [146]:
# Read the clean reference and the dirty version of the flights data.
df_clean = read_csv_dataset('../data/flights/clean.csv')
df_dirty = read_csv_dataset('../data/flights/dirty.csv')

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# The four time columns are corrupted with CategorySwap, so they are cast to categoricals.
for col_name in ('sched_dep_time', 'act_dep_time', 'sched_arr_time', 'act_arr_time'):
    df_clean[col_name] = df_clean[col_name].astype('category')

# Write the MCAR variants (create_mcar comes from the local utils module).
create_mcar('flights', df_clean, error_percentages)

Saved MCAR dataset flights iteration 0
Saved MCAR dataset flights iteration 1
Saved MCAR dataset flights iteration 2
Saved MCAR dataset flights iteration 3
Saved MCAR dataset flights iteration 4
Saved MCAR dataset flights iteration 5
Saved MCAR dataset flights iteration 6
Saved MCAR dataset flights iteration 7
Saved MCAR dataset flights iteration 8
Saved MCAR dataset flights iteration 9


In [148]:
scenario_name = 'flights_scenario'

# Export ten corrupted copies of the clean frame, each with its own block of seeds.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = list(range(base_seed + 1, base_seed + len(df_clean.columns) + 1))

    # Every time column gets the same recipe: EAR conditioned on 'flight' plus CategorySwap,
    # seeded by its position in this tuple.
    time_columns = ("sched_dep_time", "act_dep_time", "sched_arr_time", "act_arr_time")
    mid_lvl_config = {
        column: [
            ErrorModel(
                EAR(condition_to_column='flight', seed=seeds[position]),
                error_type.CategorySwap(),
                error_percentages[column],
            )
        ]
        for position, column in enumerate(time_columns)
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/flights/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/flights/{scenario_name}_clean.csv', index=False)

## food

In [165]:
# Read the food data and keep only the first 50,000 rows of each frame.
df_clean = read_csv_dataset('../data/food/clean.csv').iloc[:50000, :]
df_dirty = read_csv_dataset('../data/food/dirty.csv').iloc[:50000, :]

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# 'state' stays text (the scenario applies Replace to it); the rest become categoricals.
column_dtypes = {
    'city': 'category',
    'state': 'string',
    'facilitytype': 'category',
    'address': 'category',
    'zip': 'category',
}
for col_name, target_dtype in column_dtypes.items():
    df_clean[col_name] = df_clean[col_name].astype(target_dtype)

# Write the MCAR variants (create_mcar comes from the local utils module).
create_mcar('food', df_clean, error_percentages)

Saved MCAR dataset food iteration 0
Saved MCAR dataset food iteration 1
Saved MCAR dataset food iteration 2
Saved MCAR dataset food iteration 3
Saved MCAR dataset food iteration 4
Saved MCAR dataset food iteration 5
Saved MCAR dataset food iteration 6
Saved MCAR dataset food iteration 7
Saved MCAR dataset food iteration 8
Saved MCAR dataset food iteration 9


In [166]:
scenario_name = 'food_scenario'

# Export ten corrupted copies of the clean frame, each with its own block of seeds.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = list(range(base_seed + 1, base_seed + len(df_clean.columns) + 1))

    # (column, error mechanism, error type); every mechanism is conditioned on 'inspectiontype'.
    column_specs = [
        ("city", EAR(condition_to_column='inspectiontype', seed=seeds[0]), error_type.CategorySwap()),
        ("state", EAR(condition_to_column='inspectiontype', seed=seeds[1]), error_type.Replace({'replace_what': 'IL', 'replace_with': 'WS'})),
        ("facilitytype", EAR(condition_to_column='inspectiontype', seed=seeds[2]), error_type.CategorySwap()),
        ("address", EAR(condition_to_column='inspectiontype', seed=seeds[3]), error_type.CategorySwap()),
        ("zip", EAR(condition_to_column='inspectiontype', seed=seeds[4]), error_type.CategorySwap()),
    ]
    mid_lvl_config = {
        column: [ErrorModel(mechanism, kind, error_percentages[column])]
        for column, mechanism, kind in column_specs
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/food/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/food/{scenario_name}_clean.csv', index=False)

## rayyan

In [192]:
# Read the clean reference and the dirty version of the rayyan data.
df_clean = read_csv_dataset('../data/rayyan/clean.csv')
df_dirty = read_csv_dataset('../data/rayyan/dirty.csv')

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# Text columns (corrupted with MissingValue) use 'string'; CategorySwap columns use 'category'.
column_dtypes = {
    'article_title': 'string',
    'journal_title': 'string',
    'journal_issn': 'string',
    'article_jvolumn': 'category',
    'article_jissue': 'category',
    'article_jcreated_at': 'category',
    'article_pagination': 'string',
    'author_list': 'category',
}
for col_name, target_dtype in column_dtypes.items():
    df_clean[col_name] = df_clean[col_name].astype(target_dtype)

# Write the MCAR variants (create_mcar comes from the local utils module).
create_mcar('rayyan', df_clean, error_percentages)

Saved MCAR dataset rayyan iteration 0
Saved MCAR dataset rayyan iteration 1
Saved MCAR dataset rayyan iteration 2
Saved MCAR dataset rayyan iteration 3
Saved MCAR dataset rayyan iteration 4
Saved MCAR dataset rayyan iteration 5
Saved MCAR dataset rayyan iteration 6
Saved MCAR dataset rayyan iteration 7
Saved MCAR dataset rayyan iteration 8
Saved MCAR dataset rayyan iteration 9


In [193]:
scenario_name = 'rayyan_scenario'

# Export ten corrupted copies of the clean frame, each with its own block of seeds.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = list(range(base_seed + 1, base_seed + len(df_clean.columns) + 1))

    # (column, error mechanism, error type); the error rate is looked up per column below.
    # NOTE(review): 'journal_issn' and 'article_pagination' use EAR conditioned on the very
    # column being corrupted, which looks like it should be ENAR — confirm the intent.
    column_specs = [
        ("article_title", EAR(condition_to_column='journal_title', seed=seeds[0]), error_type.MissingValue({'missing_value': '?'})),
        ("journal_title", ENAR(seed=seeds[1]), error_type.MissingValue({'missing_value': '?'})),
        ("journal_issn", EAR(condition_to_column='journal_issn', seed=seeds[2]), error_type.MissingValue({'missing_value': '?'})),
        ("article_jvolumn", EAR(condition_to_column='journal_title', seed=seeds[3]), error_type.CategorySwap()),
        ("article_jissue", EAR(condition_to_column='journal_title', seed=seeds[4]), error_type.CategorySwap()),
        ("article_jcreated_at", EAR(condition_to_column='journal_title', seed=seeds[5]), error_type.CategorySwap()),
        ("article_pagination", EAR(condition_to_column='article_pagination', seed=seeds[6]), error_type.MissingValue({'missing_value': '?'})),
        ("author_list", EAR(condition_to_column='article_pagination', seed=seeds[7]), error_type.CategorySwap()),
    ]
    mid_lvl_config = {
        column: [ErrorModel(mechanism, kind, error_percentages[column])]
        for column, mechanism, kind in column_specs
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/rayyan/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/rayyan/{scenario_name}_clean.csv', index=False)

## restaurant

In [195]:
# Read the clean reference and the dirty version of the restaurant data.
df_clean = read_csv_dataset('../data/restaurant/clean.csv')
df_dirty = read_csv_dataset('../data/restaurant/restaurant_3_1.csv')

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# Text columns use 'string'; the columns corrupted with CategorySwap use 'category'.
column_dtypes = {
    'city': 'string',
    'phone': 'string',
    'type': 'category',
    'class': 'category',
}
for col_name, target_dtype in column_dtypes.items():
    df_clean[col_name] = df_clean[col_name].astype(target_dtype)

# Write the MCAR variants (create_mcar comes from the local utils module).
create_mcar('restaurant', df_clean, error_percentages)

Saved MCAR dataset restaurant iteration 0
Saved MCAR dataset restaurant iteration 1
Saved MCAR dataset restaurant iteration 2
Saved MCAR dataset restaurant iteration 3
Saved MCAR dataset restaurant iteration 4
Saved MCAR dataset restaurant iteration 5
Saved MCAR dataset restaurant iteration 6
Saved MCAR dataset restaurant iteration 7
Saved MCAR dataset restaurant iteration 8
Saved MCAR dataset restaurant iteration 9


In [196]:
# Display the per-column error rates computed for the restaurant data in the cell above.
error_percentages

name     0.000000
addr     0.000000
city     0.037037
phone    0.045139
type     0.062500
class    0.034722
dtype: float64

In [197]:
# Preview the first rows of the restaurant frame after the dtype casts.
df_clean.head()

Unnamed: 0,name,addr,city,phone,type,class
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,310/246-1501,american,0
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,310-246-1501,steakhouses,0
2,art's delicatessen,12224 ventura blvd.,studio city,818/762-1221,american,1
3,art's deli,12224 ventura blvd.,studio city,818-762-1221,delis,1
4,hotel bel-air,701 stone canyon rd.,bel air,310/472-1211,californian,2


In [198]:
scenario_name = 'restaurant_scenario'

# Export ten corrupted copies of the clean frame, each with its own block of seeds.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = list(range(base_seed + 1, base_seed + len(df_clean.columns) + 1))

    # (column, error mechanism, error type); the error rate is looked up per column below.
    column_specs = [
        ("city", EAR(condition_to_column='class', seed=seeds[0]), error_type.MissingValue({'missing_value': '?'})),
        ("phone", EAR(condition_to_column='class', seed=seeds[1]), error_type.Permutate({'permutation_separator': '-'})),
        ("type", EAR(condition_to_column='class', seed=seeds[2]), error_type.CategorySwap()),
        ("class", ENAR(seed=seeds[3]), error_type.CategorySwap()),
    ]
    mid_lvl_config = {
        column: [ErrorModel(mechanism, kind, error_percentages[column])]
        for column, mechanism, kind in column_specs
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/restaurant/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/restaurant/{scenario_name}_clean.csv', index=False)

## tax

In [252]:
# Read the tax data and keep only the first 50,000 rows of each frame.
df_clean = read_csv_dataset('../data/tax/clean.csv').iloc[:50000, :]
df_dirty = read_csv_dataset('../data/tax/dirty.csv').iloc[:50000, :]

# String snapshots of both frames; the comparison below is done on these.
df_clean_str = df_clean.astype(str)
df_dirty_str = df_dirty.astype(str)

# Per-column fraction of rows whose dirty value differs from the clean value.
mismatch_counts = (df_dirty_str != df_clean_str).sum()
error_percentages = mismatch_counts / df_dirty_str.shape[0]

# Four columns become categoricals; 'rate' becomes float64.
column_dtypes = {
    'state': 'category',
    'zip': 'category',
    'marital_status': 'category',
    'has_child': 'category',
    'rate': 'float64',
}
for col_name, target_dtype in column_dtypes.items():
    df_clean[col_name] = df_clean[col_name].astype(target_dtype)

# Write the MCAR variants (create_mcar comes from the local utils module).
create_mcar('tax', df_clean, error_percentages)

Saved MCAR dataset tax iteration 0
Saved MCAR dataset tax iteration 1
Saved MCAR dataset tax iteration 2
Saved MCAR dataset tax iteration 3
Saved MCAR dataset tax iteration 4
Saved MCAR dataset tax iteration 5
Saved MCAR dataset tax iteration 6
Saved MCAR dataset tax iteration 7
Saved MCAR dataset tax iteration 8
Saved MCAR dataset tax iteration 9


In [253]:
scenario_name = 'tax_scenario'

# Generate all ten corrupted variants, like every other dataset in this notebook and the
# MCAR baseline. The previous single-iteration loop only refreshed tax_scenario_0.csv,
# leaving iterations 1-9 on disk from whatever configuration had produced them earlier.
for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = [base_seed + (j+1) for j in range(len(df_clean.columns))]

    # One error model per corrupted column: names are blanked with '?', the categorical
    # columns are swapped with frequency weighting, and 'rate' gets the default missing value.
    mid_lvl_config = {
        "f_name": [ErrorModel(EAR(condition_to_column='city', seed=seeds[0]), error_type.MissingValue({'missing_value': '?'}), error_percentages['f_name'])],
        "l_name": [ErrorModel(EAR(condition_to_column='city', seed=seeds[1]), error_type.MissingValue({'missing_value': '?'}), error_percentages['l_name'])],
        "state": [ErrorModel(EAR(condition_to_column='city', seed=seeds[2]), error_type.CategorySwap({'mislabel_weighing': 'frequency'}), error_percentages['state'])],
        "zip": [ErrorModel(ENAR(seed=seeds[3]), error_type.CategorySwap({'mislabel_weighing': 'frequency'}), error_percentages['zip'])],
        "marital_status": [ErrorModel(ENAR(seed=seeds[4]), error_type.CategorySwap({'mislabel_weighing': 'frequency'}), error_percentages['marital_status'])],
        "has_child": [ErrorModel(ENAR(seed=seeds[5]), error_type.CategorySwap({'mislabel_weighing': 'frequency'}), error_percentages['has_child'])],
        "rate": [ErrorModel(ENAR(seed=seeds[6]), error_type.MissingValue(), error_percentages['rate'])],
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/tax/{scenario_name}_{i}.csv', index=False)

# The string snapshot of the clean data is exported once as the reference file.
df_clean_str.to_csv(f'../export_data/tax/{scenario_name}_clean.csv', index=False)

In [236]:
# NOTE(review): this cell writes to the same '../export_data/tax/tax_scenario_{i}.csv' paths
# as the tax scenario cell directly above it, and its execution count (In [236]) is older
# than that cell's (In [253]). On a top-to-bottom run, this all-MissingValue configuration
# is what ends up on disk. Confirm which configuration is intended and remove the other.
scenario_name = 'tax_scenario'

for i in range(10):
    base_seed = i * 1000  # Using 1000 ensures no overlap between iterations
    seeds = [base_seed + (j+1) for j in range(len(df_clean.columns))]
    
    # Every corrupted column uses MissingValue with '?' as the placeholder; only the
    # error mechanism (EAR conditioned on 'city' vs. ENAR) differs between columns.
    mid_lvl_config = {
        "f_name": [ErrorModel(EAR(condition_to_column='city', seed=seeds[0]), error_type.MissingValue({'missing_value': '?'}), error_percentages['f_name'])],
        "l_name": [ErrorModel(EAR(condition_to_column='city', seed=seeds[1]), error_type.MissingValue({'missing_value': '?'}), error_percentages['l_name'])],
        "state": [ErrorModel(EAR(condition_to_column='city', seed=seeds[2]), error_type.MissingValue({'missing_value': '?'}), error_percentages['state'])],
        "zip": [ErrorModel(ENAR(seed=seeds[3]), error_type.MissingValue({'missing_value': '?'}), error_percentages['zip'])],
        "marital_status": [ErrorModel(ENAR(seed=seeds[4]), error_type.MissingValue({'missing_value': '?'}), error_percentages['marital_status'])],
        "has_child": [ErrorModel(ENAR(seed=seeds[5]), error_type.MissingValue({'missing_value': '?'}), error_percentages['has_child'])],
        "rate": [ErrorModel(ENAR(seed=seeds[6]), error_type.MissingValue({'missing_value': '?'}), error_percentages['rate'])],
    }

    scenario = MidLevelConfig(mid_lvl_config)
    df_corrupted, error_mask = mid_level.create_errors(df_clean, scenario)
    df_corrupted.to_csv(f'../export_data/tax/{scenario_name}_{i}.csv', index=False)
# Export the string snapshot of the clean data as the reference file.
df_clean_str.to_csv(f'../export_data/tax/{scenario_name}_clean.csv', index=False)

## QA

In [254]:
# Reload the exported tax scenario for a sanity check. The names now match the contents:
# df_clean is the exported clean reference and df_dirty is corrupted iteration 0
# (previously the two were assigned the other way round). The element-wise != count
# computed further down is symmetric, so its result is unaffected.
# NOTE(review): this rebinds df_clean, so the tax scenario cells above cannot be re-run
# after this cell without first re-running the tax loading cell.
df_clean = read_csv_dataset('../export_data/tax/tax_scenario_clean.csv').astype(str).iloc[:50000, :]
df_dirty = read_csv_dataset('../export_data/tax/tax_scenario_0.csv').astype(str).iloc[:50000, :]

# Original clean/dirty pair, restricted to the same first 50,000 rows, for comparison.
df_clean_orig = read_csv_dataset('../data/tax/clean.csv').astype(str).iloc[:50000, :]
df_dirty_orig = read_csv_dataset('../data/tax/dirty.csv').astype(str).iloc[:50000, :]

In [255]:
# Total number of cells that differ between the original clean and dirty tax data
# (first 50,000 rows, compared as strings).
(df_clean_orig != df_dirty_orig).sum().sum()

np.int64(30757)

In [256]:
# Total number of cells that differ between the exported tax_scenario_0 and
# tax_scenario_clean frames loaded in the QA cell above (compared as strings).
(df_clean != df_dirty).sum().sum()

np.int64(44216)