In [1]:
import numpy as np
import pandas as pd
import blindat as bd

### `@blindat`

The decorator `@blindat` adds blinding to functions that return a pandas DataFrame as the first or only result.  The function should accept the keyword argument `rules` or generic `**kwargs`.

In [2]:
from blindat.pyfy import blindat

# simulation parameters
DATA_SEED = 19421127
NUM_ROWS = 10_000_000
COLUMNS = list("ABCD")

# fake data: uniform random values in [0, 1), one column per label;
# seeding the legacy global generator keeps the table reproducible
np.random.seed(DATA_SEED)
FAKE_DATA = pd.DataFrame(
    data=np.random.random_sample((NUM_ROWS, len(COLUMNS))),
    columns=COLUMNS,
)

# transformation rules: blind column "A" with an offset.
# NOTE(review): (10.0, 20.0) is presumably the range the seeded offset is drawn
# from -- the output below shifts A by a constant of about 13.75; confirm
# against the blindat documentation.
rules = bd.generate_rules("A", offset=(10.0, 20.0), random_seed=42)


@blindat
def load_dataframe(**kwargs):
    """Return the shared fake dataset (the ``FAKE_DATA`` object itself).

    ``**kwargs`` is not used in the body; it is accepted so that the call can
    carry the ``rules`` keyword that ``@blindat`` expects the function to allow.
    """
    return FAKE_DATA


# calling the decorated function with `rules` returns the blinded table
df = load_dataframe(rules=rules)
df.head()

Unnamed: 0,A,B,C,D
0,14.264812,0.030766,0.064909,0.930325
1,14.014989,0.562393,0.227109,0.202936
2,14.114655,0.579577,0.01545,0.53417
3,14.417311,0.868601,0.142738,0.573955
4,14.648785,0.921365,0.019821,0.263312


If an experiment generates many different data files, it might be convenient to develop a custom class with a bunch of methods for accessing each component.  You can simply add the `@blindat` decorator to any methods that return a dataframe that you want to blind.

In [3]:
class Measurement:
    """Data-access class for one experiment, with opt-in blinding.

    ``load_dataframe`` returns the data as stored; ``blind_data`` returns the
    same data through the ``@blindat`` decorator.
    """

    def __init__(self, path=None):
        """Remember ``path``; the methods in this example do not read it."""
        self.path = path  # path to data directory

    def load_dataframe(self):
        """Return the original data (here the module-level ``FAKE_DATA``)."""
        return FAKE_DATA

    @blindat
    def blind_data(self, rules=None):
        """Return the data from ``load_dataframe`` for ``@blindat`` to blind.

        ``rules`` is not read in the body; it is part of the signature so that
        callers can pass it for the decorator. With ``rules=None`` the original
        values come back unchanged.
        """
        return self.load_dataframe()


# initialize (no data directory is needed for the fake data)
measurement = Measurement()

In [4]:
# blinded data: column A is transformed by `rules`, the other columns are unchanged
measurement.blind_data(rules=rules).head()

Unnamed: 0,A,B,C,D
0,14.264812,0.030766,0.064909,0.930325
1,14.014989,0.562393,0.227109,0.202936
2,14.114655,0.579577,0.01545,0.53417
3,14.417311,0.868601,0.142738,0.573955
4,14.648785,0.921365,0.019821,0.263312


In [5]:
# original data: the undecorated method returns the values as stored
measurement.load_dataframe().head()

Unnamed: 0,A,B,C,D
0,0.519411,0.030766,0.064909,0.930325
1,0.269587,0.562393,0.227109,0.202936
2,0.369254,0.579577,0.01545,0.53417
3,0.67191,0.868601,0.142738,0.573955
4,0.903384,0.921365,0.019821,0.263312


In [6]:
# or: call the decorated method with rules=None to get the original values
measurement.blind_data(rules=None).head()

Unnamed: 0,A,B,C,D
0,0.519411,0.030766,0.064909,0.930325
1,0.269587,0.562393,0.227109,0.202936
2,0.369254,0.579577,0.01545,0.53417
3,0.67191,0.868601,0.142738,0.573955
4,0.903384,0.921365,0.019821,0.263312


This example requires the user to explicitly opt in to blinding their data (Zen of Python #2: "Explicit is better than implicit").

For consistency, and to save the user a little effort, you could include a function that returns default rules (`my_rules` in the example below) in your data-access module.  This might be appropriate if columns with certain names always have similar values and should always be blinded.

In [7]:
# in your custom data access module
# per-column blinding specification shared by every analysis
DEFAULT_SPECIFICATION = {
    "A": dict(offset=(10.0, 20.0), scale=1.0),
}


def my_rules(random_seed=None):
    """Build blinding rules from ``DEFAULT_SPECIFICATION``.

    Parameters
    ----------
    random_seed : optional
        Forwarded unchanged to ``bd.generate_rules``.

    Returns
    -------
    The value returned by ``bd.generate_rules``, suitable for the ``rules``
    keyword of a ``@blindat``-decorated function.
    """
    blinding_rules = bd.generate_rules(
        DEFAULT_SPECIFICATION,
        random_seed=random_seed,
    )
    return blinding_rules


# in your analysis notebook (a different seed gives a different offset for A)
measurement.blind_data(rules=my_rules(99)).head()

Unnamed: 0,A,B,C,D
0,17.242197,0.030766,0.064909,0.930325
1,16.992373,0.562393,0.227109,0.202936
2,17.09204,0.579577,0.01545,0.53417
3,17.394696,0.868601,0.142738,0.573955
4,17.62617,0.921365,0.019821,0.263312


Alternatively, hard-code the rules into a data-access class.  However, forgetting that such a class blinds its data by default could be problematic.  Consider using an unambiguously named subclass and/or warnings.

In [8]:
import warnings

# rules used by BlindMeasurement.load_dataframe when the caller passes none
DEFAULT_RULES = my_rules(42)


class BlindMeasurement(Measurement):
    """``Measurement`` whose ``load_dataframe`` is blinded by default.

    Construction is inherited unchanged from ``Measurement``; an ``__init__``
    that only forwards its arguments to ``super().__init__`` adds nothing and
    hides the real constructor signature, so none is defined here.
    """

    @blindat(default_rules=DEFAULT_RULES)
    def load_dataframe(self, *args, **kwargs):
        """Return the parent's data for ``@blindat`` to blind.

        ``DEFAULT_RULES`` is applied unless the caller passes ``rules``
        explicitly; ``rules=None`` returns the original values. A warning is
        emitted on every call, whichever rules end up being used, to remind
        the user that the values may not be the real ones.
        """
        warnings.warn("data values may be altered to mitigate experimenter bias.")
        return super().load_dataframe(*args, **kwargs)


blind_measurement = BlindMeasurement()

# blind by default: no `rules` argument is given, so DEFAULT_RULES is applied
blind_measurement.load_dataframe().head()



Unnamed: 0,A,B,C,D
0,14.264812,0.030766,0.064909,0.930325
1,14.014989,0.562393,0.227109,0.202936
2,14.114655,0.579577,0.01545,0.53417
3,14.417311,0.868601,0.142738,0.573955
4,14.648785,0.921365,0.019821,0.263312


In [9]:
# original data (override default_rules by passing rules=None)
blind_measurement.load_dataframe(rules=None).head()



Unnamed: 0,A,B,C,D
0,0.519411,0.030766,0.064909,0.930325
1,0.269587,0.562393,0.227109,0.202936
2,0.369254,0.579577,0.01545,0.53417
3,0.67191,0.868601,0.142738,0.573955
4,0.903384,0.921365,0.019821,0.263312
