### Example 2 - Read batches and automatically preprocess the features

In [1]:
from faucetml.data_reader import get_client

In [2]:
# Training-loop sizing, shared with the reader configuration below.
MINIBATCH_SIZE = 128
NUM_EPOCHS = 2

# Build the Faucet ML data reader. Keyword arguments are grouped by concern;
# their order does not matter to the call.
fml = get_client(
    # -- where the data lives --
    datastore="bigquery",
    credential_path="bq_creds.json",
    table_name="gradient-decision.test_titanic.training_table",
    # Partition date; it appears as the _PARTITIONTIME filter in the query
    # logged by prep_for_epoch().
    ds="2020-01-20",
    # -- how much of the table to use --
    table_sample_percent=100,
    # The logged epoch query keeps hash buckets < 0.8, i.e. the other 80%.
    test_split_percent=20,
    # -- batching --
    epochs=NUM_EPOCHS,
    batch_size=MINIBATCH_SIZE,
    # Presumably the number of rows pulled per read (5 batches' worth) - TODO confirm.
    chunk_size=MINIBATCH_SIZE * 5,
    # The recorded run reports 6 batches for 721 rows at 128 rows per batch,
    # which is consistent with the short final batch being kept.
    skip_small_batches=False,
)

In [4]:
# A single call that does two things:
#   1. computes metadata about the features in the table and generates the
#      preprocessing specifications (returned as preproc_specs)
#   2. creates a PyTorch net that applies those preprocessing specifications
#      (returned as preprocessor_net)
preproc_specs, preprocessor_net = fml.gen_preprocess_specs_and_net()

[17:05:45 INFO] Got feature: pclass
[17:05:45 INFO] Feature pclass normalization: NormalizationParameters(feature_type='ENUM', boxcox_lambda=None, boxcox_shift=0.0, mean=0.0, stddev=1.0, mode=3.0, possible_values=[1, 2, 3], quantiles=None, min_value=1.0, max_value=3.0)

[17:05:45 INFO] Got feature: sex
[17:05:45 INFO] Feature sex normalization: NormalizationParameters(feature_type='BINARY', boxcox_lambda=None, boxcox_shift=0.0, mean=0.0, stddev=1.0, mode=0.0, possible_values=None, quantiles=None, min_value=0.0, max_value=1.0)

[17:05:45 INFO] Got feature: age
[17:05:45 INFO] Feature stats: original K2: 283.9540853024901 P: 2.1885366286553843e-62 Boxcox K2: 95.95031514141382 P: 1.4610120524223594e-21
[17:05:45 INFO] Feature age normalization: NormalizationParameters(feature_type='CONTINUOUS', boxcox_lambda=None, boxcox_shift=None, mean=29.75308609008789, stddev=13.269103050231934, mode=32.36809158325195, possible_values=None, quantiles=None, min_value=0.41999998688697815, max_value=80.0

In [5]:
# Example: look up the preprocessing specification generated for the "age" feature.
# The lookup is left as the cell's last expression so the notebook displays it.
preproc_specs["age"]

NormalizationParameters(feature_type='CONTINUOUS', boxcox_lambda=None, boxcox_shift=None, mean=29.75308609008789, stddev=13.269103050231934, mode=32.36809158325195, possible_values=None, quantiles=None, min_value=0.41999998688697815, max_value=80.0)

In [6]:
# Call once per epoch.
# In the recorded run this logs the query used to generate a temp table for the
# epoch, followed by the number of rows the epoch contains.
fml.prep_for_epoch()

[17:06:18 INFO] Generating temp table with following query:
[17:06:18 INFO] select * from `gradient-decision.test_titanic.training_table` where date(_PARTITIONTIME) = '2020-01-20' and MOD(ABS(FARM_FINGERPRINT(cast(hash_on as string))), 1000) / 1000 < 0.8;
[17:06:18 INFO] Temp table generated. Took 0.74s.
[17:06:18 INFO] Epoch 1 contains 721 rows.


In [7]:
# Get a batch of training data with preprocessing applied.
# In the recorded output the result is a dict with a 'features' tensor and a
# 'labels' dict keyed by label name ('survived').
fml.get_batch(preprocess=True)

[17:06:35 INFO] Got batch 1/6 for epoch 1/2 (8%)


{'features': tensor([[ 0.0000,  0.0000, -0.3582,  ...,  0.0000,  0.0000,  1.0000],
         [ 0.0000,  1.0000, -0.6013,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000, -0.1321,  ...,  0.0000,  0.0000,  1.0000],
         ...,
         [ 0.0000,  1.0000, -0.6013,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000, -1.7147,  ...,  1.0000,  0.0000,  0.0000],
         [ 1.0000,  0.0000,  1.6766,  ...,  0.0000,  0.0000,  1.0000]]),
 'labels': {'survived': tensor([[0.],
          [1.],
          [0.],
          [0.],
          [0.],
          [1.],
          [1.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [1.],
          [1.],
          [0.],
          [1.],
          [1.],
          [0.],
          [0.],
          [1.],
          [0.],
          [1.],
          [0.],
          [1.],
          [1.],
          [0.],
          [0.],
          [0

In [8]:
# Or get a batch of training data that is not preprocessed.
# NOTE: in the recorded run this call returned batch 2/6, not a raw view of the
# previous batch - each get_batch() call appears to advance to the next batch.
fml.get_batch(preprocess=False)

[17:07:08 INFO] Got batch 2/6 for epoch 1/2 (17%)


{'features':      pclass  sex       age  num_siblings_or_spouses  num_children_or_parents  \
 128       3    0  32.00000                        0                        0   
 129       1    1  48.00000                        1                        0   
 130       3    0  40.50000                        0                        2   
 131       1    0  34.00000                        0                        0   
 132       3    1   2.00000                        3                        2   
 ..      ...  ...       ...                      ...                      ...   
 251       1    0  32.36809                        0                        0   
 252       1    0  60.00000                        1                        1   
 253       3    0  32.36809                        2                        0   
 254       3    0  10.00000                        3                        2   
 255       3    0  20.00000                        0                        0   
 
      family_s

In [10]:
# The preprocessing specification for any feature can be inspected the same way:
# index preproc_specs with the feature name.
preproc_specs["family_size"]

NormalizationParameters(feature_type='ENUM', boxcox_lambda=None, boxcox_shift=0.0, mean=0.0, stddev=1.0, mode=1.0, possible_values=[1, 2, 3, 4, 5, 6, 7, 8, 11], quantiles=None, min_value=1.0, max_value=11.0)