# Test of the High-Level API

For development purposes

In [161]:
%load_ext autoreload
%autoreload 2
from __future__ import annotations

import pandas as pd

from tab_err import error_type
from tab_err.api import high_level


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [162]:
def show_result(original_df: pd.DataFrame, perturbed_df: pd.DataFrame, error_mask: pd.DataFrame | None = None) -> pd.DataFrame:
    """Place the original and perturbed frames side by side for inspection.

    The frames are concatenated column-wise under the top-level column keys
    "original" and "perturbed". When an error mask is supplied it is appended
    as a third group under the key "error_mask".
    """
    frames = [original_df, perturbed_df]
    labels = ["original", "perturbed"]
    if error_mask is not None:
        frames.append(error_mask)
        labels.append("error_mask")
    return pd.concat(frames, keys=labels, axis=1)

In [163]:
# Small hand-written example frame: one (typist, book_title, rating) record per row.
df_typist_book_title = pd.DataFrame(
    [
        ("Alice", "To Kill a Mockingbird", 1.0),
        ("Alice", "1984", 3.0),
        ("Alice", "Pride and Prejudice", 3.0),
        ("Bob", "The Great Gatsby", 4.0),
        ("Bob", "Moby-Dick", 2.0),
        ("Bob", "The Catcher in the Rye", 1.0),
    ],
    columns=["typist", "book_title", "rating"],
)

# Show the inferred column dtypes.
print(df_typist_book_title.dtypes)

typist         object
book_title     object
rating        float64
dtype: object


### Test on the small example DataFrame

In [164]:
# Corrupt the small example frame. The second positional argument is 0.5
# (presumably the maximum error rate, as in the keyword calls further down — TODO confirm).
# NOTE(review): in the recorded output every error_mask entry is False and the perturbed
# columns equal the originals, so no errors were actually introduced for this frame.
df_corrupted, error_mask = high_level.create_errors(df_typist_book_title, 0.5)
show_result(df_typist_book_title, df_corrupted, error_mask)

Column-type dict:  {'typist': [<tab_err.error_type._extraneous.Extraneous object at 0x7f6061876110>, <tab_err.error_type._mojibake.Mojibake object at 0x7f6061874f70>, <tab_err.error_type._replace.Replace object at 0x7f6061876ad0>, <tab_err.error_type._typo.Typo object at 0x7f6061875a80>, <tab_err.error_type._missing.MissingValue object at 0x7f608633c820>], 'book_title': [<tab_err.error_type._extraneous.Extraneous object at 0x7f6061876110>, <tab_err.error_type._mojibake.Mojibake object at 0x7f6061874f70>, <tab_err.error_type._replace.Replace object at 0x7f6061876ad0>, <tab_err.error_type._typo.Typo object at 0x7f6061875a80>, <tab_err.error_type._missing.MissingValue object at 0x7f608633c820>], 'rating': [<tab_err.error_type._add_delta.AddDelta object at 0x7f60617933a0>, <tab_err.error_type._outlier.Outlier object at 0x7f6061875390>, <tab_err.error_type._wrong_unit.WrongUnit object at 0x7f60618757e0>, <tab_err.error_type._missing.MissingValue object at 0x7f608633c820>]}
Column-mech dict:

  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)


Unnamed: 0_level_0,original,original,original,perturbed,perturbed,perturbed,error_mask,error_mask,error_mask
Unnamed: 0_level_1,typist,book_title,rating,typist,book_title,rating,typist,book_title,rating
0,Alice,To Kill a Mockingbird,1.0,Alice,To Kill a Mockingbird,1.0,False,False,False
1,Alice,1984,3.0,Alice,1984,3.0,False,False,False
2,Alice,Pride and Prejudice,3.0,Alice,Pride and Prejudice,3.0,False,False,False
3,Bob,The Great Gatsby,4.0,Bob,The Great Gatsby,4.0,False,False,False
4,Bob,Moby-Dick,2.0,Bob,Moby-Dick,2.0,False,False,False
5,Bob,The Catcher in the Rye,1.0,Bob,The Catcher in the Rye,1.0,False,False,False


# Testing of High Level API with a large dataset

In [165]:
# Read in data
# NOTE: the path is relative, so clean_beers.csv must sit in the kernel's working directory.
df_clean = pd.read_csv("./clean_beers.csv")
df_clean.head()

Unnamed: 0,index,id,beer_name,style,ounces,abv,ibu,brewery_id,brewery_name,city,state
0,1,1436,Pub Beer,American Pale Lager,12.0,0.05,,408,10 Barrel Brewing Company,Bend,OR
1,2,2265,Devil's Cup,American Pale Ale (APA),12.0,0.066,,177,18th Street Brewery,Gary,IN
2,3,2264,Rise of the Phoenix,American IPA,12.0,0.071,,177,18th Street Brewery,Gary,IN
3,4,2263,Sinister,American Double / Imperial IPA,12.0,0.09,,177,18th Street Brewery,Gary,IN
4,5,2262,Sex and Candy,American IPA,12.0,0.075,,177,18th Street Brewery,Gary,IN


In [166]:
# Remove the columns that are not needed for the tests below.
# df_clean is rebound from itself, so on a second execution of this cell the columns are
# already gone; errors="ignore" keeps the cell re-runnable instead of raising KeyError.
df_clean = df_clean.drop(columns=["index", "id", "ounces", "ibu", "brewery_id", "city"], errors="ignore")
df_clean.head()

Unnamed: 0,beer_name,style,abv,brewery_name,state
0,Pub Beer,American Pale Lager,0.05,10 Barrel Brewing Company,OR
1,Devil's Cup,American Pale Ale (APA),0.066,18th Street Brewery,IN
2,Rise of the Phoenix,American IPA,0.071,18th Street Brewery,IN
3,Sinister,American Double / Imperial IPA,0.09,18th Street Brewery,IN
4,Sex and Candy,American IPA,0.075,18th Street Brewery,IN


In [167]:
# Add a date time column, just all the same date
# (a single timestamp is assigned, so every row receives the identical value;
# the dtypes output further down reports the column as datetime64[ns]).
df_clean["arbitrary_date"] = pd.to_datetime("2025-02-20 14:30:00")
df_clean.head()

Unnamed: 0,beer_name,style,abv,brewery_name,state,arbitrary_date
0,Pub Beer,American Pale Lager,0.05,10 Barrel Brewing Company,OR,2025-02-20 14:30:00
1,Devil's Cup,American Pale Ale (APA),0.066,18th Street Brewery,IN,2025-02-20 14:30:00
2,Rise of the Phoenix,American IPA,0.071,18th Street Brewery,IN,2025-02-20 14:30:00
3,Sinister,American Double / Imperial IPA,0.09,18th Street Brewery,IN,2025-02-20 14:30:00
4,Sex and Candy,American IPA,0.075,18th Street Brewery,IN,2025-02-20 14:30:00


In [168]:
# Cast each text column to str, one column at a time, then show the resulting dtypes.
for text_column in ("beer_name", "style", "brewery_name", "state"):
    df_clean[text_column] = df_clean[text_column].astype(str)
df_clean.dtypes

beer_name                 object
style                     object
abv                      float64
brewery_name              object
state                     object
arbitrary_date    datetime64[ns]
dtype: object

In [169]:
# No error type spec
# Only max_error_rate is passed, so create_errors picks the error types per column itself
# (they are listed in the "Column-type dict" line of the output).
df_dirty, error_mask = high_level.create_errors(df_clean, max_error_rate=0.75)
show_result(df_clean, df_dirty, error_mask)

Column-type dict:  {'beer_name': [<tab_err.error_type._extraneous.Extraneous object at 0x7f605beef340>, <tab_err.error_type._mojibake.Mojibake object at 0x7f605beefc10>, <tab_err.error_type._replace.Replace object at 0x7f605beef910>, <tab_err.error_type._typo.Typo object at 0x7f605beecd60>, <tab_err.error_type._missing.MissingValue object at 0x7f605beee470>], 'style': [<tab_err.error_type._extraneous.Extraneous object at 0x7f605beef340>, <tab_err.error_type._mojibake.Mojibake object at 0x7f605beefc10>, <tab_err.error_type._replace.Replace object at 0x7f605beef910>, <tab_err.error_type._typo.Typo object at 0x7f605beecd60>, <tab_err.error_type._missing.MissingValue object at 0x7f605beee470>], 'abv': [<tab_err.error_type._add_delta.AddDelta object at 0x7f605beef9d0>, <tab_err.error_type._outlier.Outlier object at 0x7f605beef0a0>, <tab_err.error_type._wrong_unit.WrongUnit object at 0x7f605beefdc0>, <tab_err.error_type._missing.MissingValue object at 0x7f605beee470>], 'brewery_name': [<tab_

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)


Unnamed: 0_level_0,original,original,original,original,original,original,perturbed,perturbed,perturbed,perturbed,perturbed,perturbed,error_mask,error_mask,error_mask,error_mask,error_mask,error_mask
Unnamed: 0_level_1,beer_name,style,abv,brewery_name,state,arbitrary_date,beer_name,style,abv,brewery_name,state,arbitrary_date,beer_name,style,abv,brewery_name,state,arbitrary_date
0,Pub Beer,American Pale Lager,0.050,10 Barrel Brewing Company,OR,2025-02-20 14:30:00,Pub Beer,American Pale Lager,0.500000,10 Barrel Brewing Company,OF,NaT,False,False,True,False,True,True
1,Devil's Cup,American Pale Ale (APA),0.066,18th Street Brewery,IN,2025-02-20 14:30:00,~^Devil's Cup@+,American Pale Ale (APA),0.066000,18th Street Brewery,IN,2025-02-20 14:30:00.000000000,True,False,False,False,True,False
2,Rise of the Phoenix,American IPA,0.071,18th Street Brewery,IN,2025-02-20 14:30:00,~^Rise of the Phoenix@+,American IPA,-1.167647,8th Street Brewery,IN,2025-02-20 14:30:00.000000000,True,False,True,True,False,True
3,Sinister,American Double / Imperial IPA,0.090,18th Street Brewery,IN,2025-02-20 14:30:00,~^Sinister@+,,0.090000,18th Street Brewery,IN,2025-02-20 14:30:00.000000000,True,True,False,False,True,True
4,Sex and Candy,American IPA,0.075,18th Street Brewery,IN,2025-02-20 14:30:00,~^Sex and Candy@+,American IPA,0.075000,18th Street Brewery,IN,2025-02-20 14:30:00.000000000,True,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2405,Belgorado,Belgian IPA,0.067,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,~^Belgorado@+,Belgian IPA,-1.171647,Wynkoop Brewing Company,CO,2025-02-20 14:30:00.000000000,True,True,True,True,True,False
2406,Rail Yard Ale,American Amber / Red Ale,0.052,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,Rail Yard Ale,American Amber / Red Ale,-1.186647,Wynkoop Brewing Company,CL,2025-02-20 14:30:00.000000000,False,True,True,True,True,False
2407,B3K Black Lager,Schwarzbier,0.055,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,B3K Black Lager,Schwarzbier,-1.183647,Wynkoop Brewing Company,CO,2061-01-07 20:47:59.946644480,False,False,True,True,False,True
2408,Silverback Pale Ale,American Pale Ale (APA),0.055,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,~^Silverback Pale Ale@+,American Pale Ale (APA),0.055000,Wynkoop Brewing Company,CO,NaT,True,False,False,True,False,True


### Test of include/exclude error types

In [175]:
# Passing both error_types_to_exclude and error_types_to_include -- should throw an error -- confirmed
#df_dirty, error_mask = high_level.create_errors(df_clean, max_error_rate=0.75, error_types_to_exclude=[error_type.AddDelta()], error_types_to_include=[error_type.AddDelta()])
#show_result(df_clean, df_dirty, error_mask)

In [176]:
# Error types to exclude - well formed
# MissingValue is excluded; in the visible part of the recorded "Column-type dict"
# no column lists a MissingValue entry.
df_dirty, error_mask = high_level.create_errors(df_clean, max_error_rate=0.75, error_types_to_exclude=[error_type.MissingValue()])
show_result(df_clean, df_dirty, error_mask)

Column-type dict:  {'beer_name': [<tab_err.error_type._extraneous.Extraneous object at 0x7f608633c820>, <tab_err.error_type._mojibake.Mojibake object at 0x7f606135c670>, <tab_err.error_type._replace.Replace object at 0x7f605beefdf0>, <tab_err.error_type._typo.Typo object at 0x7f605beece80>], 'style': [<tab_err.error_type._extraneous.Extraneous object at 0x7f608633c820>, <tab_err.error_type._mojibake.Mojibake object at 0x7f606135c670>, <tab_err.error_type._replace.Replace object at 0x7f605beefdf0>, <tab_err.error_type._typo.Typo object at 0x7f605beece80>], 'abv': [<tab_err.error_type._add_delta.AddDelta object at 0x7f6060b67220>, <tab_err.error_type._outlier.Outlier object at 0x7f605beefc70>, <tab_err.error_type._wrong_unit.WrongUnit object at 0x7f605beefd00>], 'brewery_name': [<tab_err.error_type._extraneous.Extraneous object at 0x7f608633c820>, <tab_err.error_type._mojibake.Mojibake object at 0x7f606135c670>, <tab_err.error_type._replace.Replace object at 0x7f605beefdf0>, <tab_err.err

  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)
  return self._apply(data, error_mask, column)


Unnamed: 0_level_0,original,original,original,original,original,original,perturbed,perturbed,perturbed,perturbed,perturbed,perturbed,error_mask,error_mask,error_mask,error_mask,error_mask,error_mask
Unnamed: 0_level_1,beer_name,style,abv,brewery_name,state,arbitrary_date,beer_name,style,abv,brewery_name,state,arbitrary_date,beer_name,style,abv,brewery_name,state,arbitrary_date
0,Pub Beer,American Pale Lager,0.050,10 Barrel Brewing Company,OR,2025-02-20 14:30:00,Pug Beer,American Pale Lafer,0.050000,10 Barrel Brewing Company,0R,2025-02-20 14:30:00,True,True,False,False,True,True
1,Devil's Cup,American Pale Ale (APA),0.066,18th Street Brewery,IN,2025-02-20 14:30:00,Devil's Cup,American Pale Ale (APA),0.660000,18th Street Brewery,IN,2025-02-20 14:30:00,False,False,True,True,True,True
2,Rise of the Phoenix,American IPA,0.071,18th Street Brewery,IN,2025-02-20 14:30:00,RiseofthePhoenix,American ILA,0.071000,18thStreetBrewery,IN,2025-02-20 14:30:00,True,True,False,True,True,True
3,Sinister,American Double / Imperial IPA,0.090,18th Street Brewery,IN,2025-02-20 14:30:00,Sinister,Anerican Double / Imperial IPA,0.090000,18th Street Brewery,IN,2025-02-20 14:30:00,True,True,False,True,False,True
4,Sex and Candy,American IPA,0.075,18th Street Brewery,IN,2025-02-20 14:30:00,Sex and Candy,American IPA,0.159713,18th Street B5ewery,IN,2025-02-20 14:30:00,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2405,Belgorado,Belgian IPA,0.067,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,[Belgorado,Belgian IPA,0.670000,[Wynkoop Brewing Company,CO,2025-02-20 14:30:00,True,False,True,True,True,True
2406,Rail Yard Ale,American Amber / Red Ale,0.052,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,[Rail Yard Ale,American Amber / Red Ale,0.477383,Wynkoop Brewing C0mpany,[CO,2025-02-20 14:30:00,True,True,True,True,True,True
2407,B3K Black Lager,Schwarzbier,0.055,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,B3K Black Lager,Schwarzbier,0.550000,Wynkoop Bresing Company,CO,2025-02-20 14:30:00,False,True,True,True,False,False
2408,Silverback Pale Ale,American Pale Ale (APA),0.055,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,Silverback Pale Ale,American Pale Ale (APA),0.193573,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,False,True,True,False,False,True


In [177]:
# error types to include - well formed
# Only AddDelta is allowed; per the recorded "Column-type dict" it is assigned to abv and
# arbitrary_date, while the remaining columns get an empty list of error types.
# NOTE(review): in the recorded output arbitrary_date is flagged in error_mask although the
# displayed values look unchanged — confirm whether AddDelta actually alters datetime values.
df_dirty, error_mask = high_level.create_errors(df_clean, max_error_rate=0.75, error_types_to_include=[error_type.AddDelta()])
show_result(df_clean, df_dirty, error_mask)

Column-type dict:  {'beer_name': [], 'style': [], 'abv': [<tab_err.error_type._add_delta.AddDelta object at 0x7f6061a6fa00>], 'brewery_name': [], 'state': [], 'arbitrary_date': [<tab_err.error_type._add_delta.AddDelta object at 0x7f6061a6fa00>]}
Column-mech dict:  {'beer_name': [<tab_err.error_mechanism._enar.ENAR object at 0x7f6061384760>, <tab_err.error_mechanism._ecar.ECAR object at 0x7f6061384820>, <tab_err.error_mechanism._ear.EAR object at 0x7f6061384130>, <tab_err.error_mechanism._ear.EAR object at 0x7f6061385c30>, <tab_err.error_mechanism._ear.EAR object at 0x7f6061386a10>, <tab_err.error_mechanism._ear.EAR object at 0x7f60613861a0>, <tab_err.error_mechanism._ear.EAR object at 0x7f6061386a70>], 'style': [<tab_err.error_mechanism._enar.ENAR object at 0x7f6061384760>, <tab_err.error_mechanism._ecar.ECAR object at 0x7f6061384820>, <tab_err.error_mechanism._ear.EAR object at 0x7f6061384d90>, <tab_err.error_mechanism._ear.EAR object at 0x7f60613850c0>, <tab_err.error_mechanism._ear.

  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  return self._apply(data, error_mask, column)


Unnamed: 0_level_0,original,original,original,original,original,original,perturbed,perturbed,perturbed,perturbed,perturbed,perturbed,error_mask,error_mask,error_mask,error_mask,error_mask,error_mask
Unnamed: 0_level_1,beer_name,style,abv,brewery_name,state,arbitrary_date,beer_name,style,abv,brewery_name,state,arbitrary_date,beer_name,style,abv,brewery_name,state,arbitrary_date
0,Pub Beer,American Pale Lager,0.050,10 Barrel Brewing Company,OR,2025-02-20 14:30:00,Pub Beer,American Pale Lager,-0.671726,10 Barrel Brewing Company,OR,2025-02-20 14:30:00,False,False,True,False,False,True
1,Devil's Cup,American Pale Ale (APA),0.066,18th Street Brewery,IN,2025-02-20 14:30:00,Devil's Cup,American Pale Ale (APA),-0.655726,18th Street Brewery,IN,2025-02-20 14:30:00,False,False,True,False,False,True
2,Rise of the Phoenix,American IPA,0.071,18th Street Brewery,IN,2025-02-20 14:30:00,Rise of the Phoenix,American IPA,-0.650726,18th Street Brewery,IN,2025-02-20 14:30:00,False,False,True,False,False,True
3,Sinister,American Double / Imperial IPA,0.090,18th Street Brewery,IN,2025-02-20 14:30:00,Sinister,American Double / Imperial IPA,0.090000,18th Street Brewery,IN,2025-02-20 14:30:00,False,False,False,False,False,True
4,Sex and Candy,American IPA,0.075,18th Street Brewery,IN,2025-02-20 14:30:00,Sex and Candy,American IPA,0.075000,18th Street Brewery,IN,2025-02-20 14:30:00,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2405,Belgorado,Belgian IPA,0.067,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,Belgorado,Belgian IPA,-0.654726,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,False,False,True,False,False,False
2406,Rail Yard Ale,American Amber / Red Ale,0.052,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,Rail Yard Ale,American Amber / Red Ale,-0.669726,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,False,False,True,False,False,False
2407,B3K Black Lager,Schwarzbier,0.055,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,B3K Black Lager,Schwarzbier,-0.666726,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,False,False,True,False,False,True
2408,Silverback Pale Ale,American Pale Ale (APA),0.055,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,Silverback Pale Ale,American Pale Ale (APA),-0.666726,Wynkoop Brewing Company,CO,2025-02-20 14:30:00,False,False,True,False,False,True


In [178]:
# Error types to include - malformed (empty list) -- should throw an error - works
# df_dirty, error_mask = high_level.create_errors(df_clean, max_error_rate=0.75, error_types_to_include=[])
# show_result(df_clean, df_dirty, error_mask)

In [179]:
# Report the mean of the error mask: over all cells (axis=None) and per column (axis=0).
# error_mask is whatever the most recently executed create_errors cell produced; the recorded
# numbers (only abv and arbitrary_date non-zero) match the include-only AddDelta run above.
print("Overall Error Rate", error_mask.mean(axis=None), "\n\nColumn-Wise Error Rates:")
error_mask.mean(axis=0)

Overall Error Rate 0.24979253112033195 

Column-Wise Error Rates:


beer_name         0.000000
style             0.000000
abv               0.749378
brewery_name      0.000000
state             0.000000
arbitrary_date    0.749378
dtype: float64

### Test of the include/exclude error mechanisms