In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Use the `low_level` API to create errors (`Permutate`, `Mojibake`, `Butterfinger`, `WrongUnit`, `Mislabel`, `MissingValue`) in one column

In [46]:
import pandas as pd

from error_generation.api.low_level import create_errors
from error_generation.error_mechanism import ECAR
from error_generation.error_type import Butterfinger, Mislabel, MissingValue, Mojibake, Permutate, WrongUnit

In [47]:
# Shared error mechanism reused by every example in this notebook.
# error_rate=1.0 — presumably every cell of the target column is selected for corruption (TODO confirm against the ECAR docs).
ecar = ECAR(error_rate=1.0)

## Permutation

In [49]:
# Toy frame for the permutation demo: column "B" holds two space-separated
# words per cell, column "A" single words, column "C" integers.
data = {
    "A": ["apple", "banana", "cherry", "pineapple"],
    "B": ["red apple", "yellow banana", "dark cherry", "blue pineapple"],
    "C": [10, 20, 30, 40],
}
df_permutate = pd.DataFrame(data=data)

In [52]:
# Configure the permutation to treat a single space as the token separator and to use the "fixed" pattern.
# With the two-word values in column "B" the output below shows the two words swapped ("red apple" -> "apple red").
permutate = Permutate({"permutation_separator": " ", "permutation_pattern": "fixed"})

In [53]:
df_corrupted, error_mask = create_errors(df_permutate, "B", ecar, permutate)

In [54]:
df_corrupted

Unnamed: 0,A,B,C
0,apple,apple red,10
1,banana,banana yellow,20
2,cherry,cherry dark,30
3,pineapple,pineapple blue,40


## Mojibake

In [38]:
mojibake = Mojibake()

In [39]:
# Column "b" mixes two plain-ASCII words with one address containing umlauts and "ß".
df_mojibake = pd.DataFrame(
    {
        "a": [0, 1, 2],
        "b": ["Ente", "Haus", "Grünfelder Straße 17, 13357 Öppeln"],
    }
)

In [40]:
df_corrupted, error_mask = create_errors(df_mojibake, "b", ecar, mojibake)

In [41]:
df_corrupted

Unnamed: 0,a,b
0,0,Ente
1,1,Haus
2,2,"Grnfelder Strae 17, 13357 ppeln"


## Butterfinger

In [102]:
butterfinger = Butterfinger()

In [103]:
# Three German words in column "b" serve as the text to be corrupted with typos.
df_butterfinger = pd.DataFrame(
    {
        "a": [0, 1, 2],
        "b": ["Entspannung", "Genugtuung", "Ausgeglichenheit"],
    }
)

In [104]:
df_corrupted, error_mask = create_errors(df_butterfinger, "b", ecar, butterfinger)

In [105]:
df_corrupted

Unnamed: 0,a,b
0,0,Entspannyng
1,1,Genugyuung
2,2,Ausgeglichenhejt


## Wrong Unit

In [97]:
wrong_unit = WrongUnit({"wrong_unit_scaling": lambda x: x / 1000})

In [98]:
# Two integer columns; "b" holds the values that the wrong-unit scaling is applied to.
df_wrong_unit = pd.DataFrame(
    {
        "a": [0, 1, 2],
        "b": [40, 50, 60],
    }
)

In [99]:
# Unlike the other examples, the column is addressed by position rather than by name:
# index 1 is column "b", which is the column scaled in the output below.
df_corrupted, error_mask = create_errors(df_wrong_unit, 1, ecar, wrong_unit)

In [100]:
df_corrupted

Unnamed: 0,a,b
0,0,0.04
1,1,0.05
2,2,0.06


## Mislabel

In [108]:
mislabel = Mislabel()

In [109]:
# Column "b" carries two distinct labels ("blau", "gelb") to be swapped by the mislabel demo.
df_mislabel = pd.DataFrame(
    {
        "a": [1, 2, 3],
        "b": ["blau", "gelb", "blau"],
    }
)

In [111]:
# Cast "b" to pandas' categorical dtype before corrupting it —
# presumably Mislabel draws replacement labels from the column's categories (TODO confirm).
df_mislabel["b"] = df_mislabel["b"].astype("category")

In [119]:
df_corrupted, error_mask = create_errors(df_mislabel, "b", ecar, mislabel)

In [121]:
df_corrupted

Unnamed: 0,a,b
0,1,gelb
1,2,blau
2,3,gelb


## Missing Value

In [124]:
missing = MissingValue()

In [125]:
# Small frame for the missing-value demo; column "b" holds plain string labels.
df_missing = pd.DataFrame(
    {
        "a": [1, 2, 3],
        "b": ["blau", "gelb", "blau"],
    }
)

In [126]:
# Apply the MissingValue error type to column "b" of the frame built for this section.
# `df_missing` is passed (not `df_mislabel` from the previous section) so this example
# is self-contained and does not depend on the categorical frame created for Mislabel.
df_corrupted, error_mask = create_errors(df_missing, "b", ecar, missing)

In [128]:
df_corrupted

Unnamed: 0,a,b
0,1,
1,2,
2,3,
