# Test of the High-Level API

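This notebook is for development purposes: it exercises the high-level API on a small hand-made DataFrame and on a larger dataset.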

In [73]:
%load_ext autoreload
%autoreload 2
from __future__ import annotations

import pandas as pd

from tab_err.api import high_level

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
def show_result(original_df: pd.DataFrame, perturbed_df: pd.DataFrame, error_mask: pd.DataFrame | None = None) -> pd.DataFrame:
    """Lay out the original and perturbed frames side by side for inspection.

    Args:
        original_df: The frame before perturbation.
        perturbed_df: The frame after perturbation.
        error_mask: Optional frame that is appended as a third group when given.

    Returns:
        A column-wise concatenation whose top column level is "original",
        "perturbed" and, if a mask was passed, "error_mask".
    """
    frames = [original_df, perturbed_df]
    group_labels = ["original", "perturbed"]

    # The mask is optional; only add its group when the caller supplied one.
    if error_mask is not None:
        frames.append(error_mask)
        group_labels.append("error_mask")

    return pd.concat(frames, keys=group_labels, axis=1)

In [75]:
# Small hand-made example frame: two text columns and one float column.
df_typist_book_title = pd.DataFrame(
    {
        "typist": ["Alice", "Alice", "Alice", "Bob", "Bob", "Bob"],
        "book_title": [
            "To Kill a Mockingbird",
            "1984",
            "Pride and Prejudice",
            "The Great Gatsby",
            "Moby-Dick",
            "The Catcher in the Rye",
        ],
        "rating": [1.0, 3.0, 3.0, 4.0, 2.0, 1.0],
    }
)

# Show the inferred column dtypes.
print(df_typist_book_title.dtypes)

typist         object
book_title     object
rating        float64
dtype: object


### Test on the small example DataFrame

In [76]:
# Corrupt the small example frame. The rate is passed by keyword so its meaning is
# explicit at the call site.
# NOTE(review): the stored output for this cell shows an all-False error mask, i.e. no
# cell of the 6-row frame was perturbed despite max_error_rate=0.5 — confirm whether
# that is expected.
df_corrupted, error_mask = high_level.create_errors(df_typist_book_title, max_error_rate=0.5)
show_result(df_typist_book_title, df_corrupted, error_mask)

Column-type dict:  {'typist': [<tab_err.error_type._extraneous.Extraneous object at 0x7f4d9a90ce20>, <tab_err.error_type._mojibake.Mojibake object at 0x7f4d9a90c490>, <tab_err.error_type._replace.Replace object at 0x7f4d9a90f7c0>, <tab_err.error_type._typo.Typo object at 0x7f4d9a90c520>, <tab_err.error_type._missing.MissingValue object at 0x7f4d9a90c0d0>], 'book_title': [<tab_err.error_type._extraneous.Extraneous object at 0x7f4d9a90ce20>, <tab_err.error_type._mojibake.Mojibake object at 0x7f4d9a90c490>, <tab_err.error_type._replace.Replace object at 0x7f4d9a90f7c0>, <tab_err.error_type._typo.Typo object at 0x7f4d9a90c520>, <tab_err.error_type._missing.MissingValue object at 0x7f4d9a90c0d0>], 'rating': [<tab_err.error_type._add_delta.AddDelta object at 0x7f4d9a90d2a0>, <tab_err.error_type._outlier.Outlier object at 0x7f4d9a90de70>, <tab_err.error_type._wrong_unit.WrongUnit object at 0x7f4d9a90f430>, <tab_err.error_type._missing.MissingValue object at 0x7f4d9a90c0d0>]}
Column-mech dict:

  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)
  col_error_rates = build_column_error_rate_dictionary(data, max_error_rate, col_num_models)


Unnamed: 0_level_0,original,original,original,perturbed,perturbed,perturbed,error_mask,error_mask,error_mask
Unnamed: 0_level_1,typist,book_title,rating,typist,book_title,rating,typist,book_title,rating
0,Alice,To Kill a Mockingbird,1.0,Alice,To Kill a Mockingbird,1.0,False,False,False
1,Alice,1984,3.0,Alice,1984,3.0,False,False,False
2,Alice,Pride and Prejudice,3.0,Alice,Pride and Prejudice,3.0,False,False,False
3,Bob,The Great Gatsby,4.0,Bob,The Great Gatsby,4.0,False,False,False
4,Bob,Moby-Dick,2.0,Bob,Moby-Dick,2.0,False,False,False
5,Bob,The Catcher in the Rye,1.0,Bob,The Catcher in the Rye,1.0,False,False,False


# Testing the High-Level API with a larger dataset

In [77]:
# Load the beers CSV (path is relative to the current working directory) and preview it.
df_clean = pd.read_csv(filepath_or_buffer="./clean_beers.csv")
df_clean.head(n=5)

Unnamed: 0,index,id,beer_name,style,ounces,abv,ibu,brewery_id,brewery_name,city,state
0,1,1436,Pub Beer,American Pale Lager,12.0,0.05,,408,10 Barrel Brewing Company,Bend,OR
1,2,2265,Devil's Cup,American Pale Ale (APA),12.0,0.066,,177,18th Street Brewery,Gary,IN
2,3,2264,Rise of the Phoenix,American IPA,12.0,0.071,,177,18th Street Brewery,Gary,IN
3,4,2263,Sinister,American Double / Imperial IPA,12.0,0.09,,177,18th Street Brewery,Gary,IN
4,5,2262,Sex and Candy,American IPA,12.0,0.075,,177,18th Street Brewery,Gary,IN


In [78]:
# Keep only the columns used in this test. errors="ignore" makes the cell safe to
# re-run: df_clean is overwritten here, so a second execution would otherwise raise
# KeyError because the listed columns are already gone.
df_clean = df_clean.drop(columns=["index", "id", "ounces", "ibu", "brewery_id", "city"], errors="ignore")
df_clean.head()

Unnamed: 0,beer_name,style,abv,brewery_name,state
0,Pub Beer,American Pale Lager,0.05,10 Barrel Brewing Company,OR
1,Devil's Cup,American Pale Ale (APA),0.066,18th Street Brewery,IN
2,Rise of the Phoenix,American IPA,0.071,18th Street Brewery,IN
3,Sinister,American Double / Imperial IPA,0.09,18th Street Brewery,IN
4,Sex and Candy,American IPA,0.075,18th Street Brewery,IN


In [79]:
# Cast the four text columns to str with a single dtype mapping.
# NOTE(review): depending on the pandas version, astype(str) may turn missing values
# into the literal string "nan" — confirm these columns have no missing entries or
# that this is intended.
df_clean = df_clean.astype(
    {
        "beer_name": str,
        "style": str,
        "brewery_name": str,
        "state": str,
    }
)

In [80]:
# Perturb the beers frame with max_error_rate=0.75, then view the clean frame, the
# dirty frame and the error mask side by side.
df_dirty, error_mask = high_level.create_errors(df_clean, max_error_rate=0.75)
show_result(original_df=df_clean, perturbed_df=df_dirty, error_mask=error_mask)

Column-type dict:  {'beer_name': [<tab_err.error_type._extraneous.Extraneous object at 0x7f4d9a90dfc0>, <tab_err.error_type._mojibake.Mojibake object at 0x7f4d9a90cf40>, <tab_err.error_type._replace.Replace object at 0x7f4d9a90e650>, <tab_err.error_type._typo.Typo object at 0x7f4d9a90f820>, <tab_err.error_type._missing.MissingValue object at 0x7f4d9a90c700>], 'style': [<tab_err.error_type._extraneous.Extraneous object at 0x7f4d9a90dfc0>, <tab_err.error_type._mojibake.Mojibake object at 0x7f4d9a90cf40>, <tab_err.error_type._replace.Replace object at 0x7f4d9a90e650>, <tab_err.error_type._typo.Typo object at 0x7f4d9a90f820>, <tab_err.error_type._missing.MissingValue object at 0x7f4d9a90c700>], 'abv': [<tab_err.error_type._add_delta.AddDelta object at 0x7f4d9a90e5c0>, <tab_err.error_type._outlier.Outlier object at 0x7f4d9a90dc90>, <tab_err.error_type._wrong_unit.WrongUnit object at 0x7f4d9a90df30>, <tab_err.error_type._missing.MissingValue object at 0x7f4d9a90c700>], 'brewery_name': [<tab_

Unnamed: 0_level_0,original,original,original,original,original,perturbed,perturbed,perturbed,perturbed,perturbed,error_mask,error_mask,error_mask,error_mask,error_mask
Unnamed: 0_level_1,beer_name,style,abv,brewery_name,state,beer_name,style,abv,brewery_name,state,beer_name,style,abv,brewery_name,state
0,Pub Beer,American Pale Lager,0.050,10 Barrel Brewing Company,OR,Pub Beer,American Pale Lagrr,0.050,10 Barrel Brewing Company,OR,True,True,False,False,True
1,Devil's Cup,American Pale Ale (APA),0.066,18th Street Brewery,IN,,American Pale Ale (APA),0.660,18th Street Brewery,IJ,True,True,True,False,True
2,Rise of the Phoenix,American IPA,0.071,18th Street Brewery,IN,,American IPA,0.071,18th Street Brewerz,IN,True,False,False,True,True
3,Sinister,American Double / Imperial IPA,0.090,18th Street Brewery,IN,Sinister,American Double / Im-erial IPA,0.090,18th Street Breaery,IN,True,True,False,True,False
4,Sex and Candy,American IPA,0.075,18th Street Brewery,IN,Sex and Candy,"A,erican IPA",0.075,18th Street Brewerz,IN,True,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2405,Belgorado,Belgian IPA,0.067,Wynkoop Brewing Company,CO,Belgorado,Belgoan IPA,0.670,Wynkoop Brewing Company,CO,True,True,True,True,False
2406,Rail Yard Ale,American Amber / Red Ale,0.052,Wynkoop Brewing Company,CO,Rail Yard Ale,.American Amber / Red Ale,0.052,Wynkoop Brewing Cojpany,CO,True,True,False,True,True
2407,B3K Black Lager,Schwarzbier,0.055,Wynkoop Brewing Company,CO,B3K Black Lager,Schwarzbier,0.055,Wznkoop Brewing Companz,CO,False,False,False,True,False
2408,Silverback Pale Ale,American Pale Ale (APA),0.055,Wynkoop Brewing Company,CO,Silverback Pale Ale,.American Pale Ale (APA),0.055,Wznkoop Brewing Companz,CP,True,True,False,True,True


In [81]:
# Mean of the mask over all cells. This used to be a bare expression that was not the
# last one in the cell, so its value was computed and then silently discarded; print it
# so it actually appears in the output.
print("Overall error rate:", error_mask.mean(axis=None))

# Per-column mean of the mask (displayed as the cell result).
error_mask.mean(axis=0)

beer_name       0.746888
style           0.746888
abv             0.746888
brewery_name    0.746888
state           0.746888
dtype: float64