In [30]:
from faucetml.data_reader import get_batch_reader

### Example 1 - Read mini-batches from BigQuery (no feature store or preprocessing)

Assumes the data is stored in a BigQuery table with the following schema:

| hash_on (NUMERIC or STRING) |       features (STRUCT)       |  labels (STRUCT) |
|-----------------------------|-------------------------------|------------------|
|        231248228319         | {"age": 16, "ctr": 0.021, ...} |  {"clicked": 0}  |
|        913672219001         | {"age": 33, "ctr": 0.056, ...} |  {"clicked": 0}  |

Note: `hash_on` is used for consistent sampling and for the training / test split. In BigQuery, simply use
something like `select rand() * 100000 as hash_on, ...` to create the `hash_on` column.

In [34]:
# Run-level settings; NUM_EPOCHS also drives the epoch loop further down.
NUM_EPOCHS = 2
MINIBATCH_SIZE = 128

# Keyword arguments for the BigQuery-backed batch reader, gathered in one
# place so they can be inspected or tweaked before the reader is built.
reader_kwargs = dict(
    datastore="bigquery",
    credential_path="bq_creds.json",
    table_name="gradient-decision.test_titanic.training_table",
    ds="2020-01-20",  # shows up in the generated query as the _PARTITIONTIME date filter
    epochs=NUM_EPOCHS,
    batch_size=MINIBATCH_SIZE,
    chunk_size=5 * MINIBATCH_SIZE,
    table_sample_percent=100,
    test_split_percent=20,  # logged queries use hash < 0.8 for training, 0.8-1.0 for eval
    skip_small_batches=False,
)

batch_cli = get_batch_reader(**reader_kwargs)

In [32]:
# Call once per epoch, before fetching any training batches.
# Per the log output below, this generates a temp table holding the
# training portion of the data for the epoch and reports its row count.
batch_cli.prep_for_epoch()

[14:01:29 INFO] Generating temp table with following query:
[14:01:29 INFO] select * from `gradient-decision.test_titanic.training_table` where date(_PARTITIONTIME) = '2020-01-20' and MOD(ABS(FARM_FINGERPRINT(cast(hash_on as string))), 1000) / 1000 < 0.8;
[14:01:30 INFO] Temp table generated. Took 0.54s.
[14:01:30 INFO] Epoch 1 contains 721 rows.


In [33]:
# Get one batch of training data. The displayed result is a dict whose keys
# include 'features_df' (looks like a pandas DataFrame of feature columns;
# the output shown here is truncated, so other keys are not visible).
batch_cli.get_batch()

[14:01:31 INFO] Got batch 1/6 for epoch 1/2 (8%)


{'features_df':      pclass  sex        age  num_siblings_or_spouses  num_children_or_parents  \
 0         3    0  25.000000                        0                        0   
 1         3    1  21.773973                        0                        0   
 2         1    0  28.000000                        1                        0   
 3         3    0   4.574167                        3                        1   
 4         3    1   9.000000                        3                        2   
 ..      ...  ...        ...                      ...                      ...   
 123       2    1  32.000000                        0                        0   
 124       3    0  48.000000                        0                        0   
 125       2    1  21.773973                        0                        0   
 126       3    0   7.000000                        4                        1   
 127       1    0  52.000000                        1                        1   
 

In [35]:
# Or run through the whole data set: one training pass followed by one
# evaluation pass per epoch.
#
# get_batch() returns None once the current pass is exhausted, so each loop
# consumes the batch it already holds *before* fetching the next one. Fetching
# first would skip the first batch and hand the final None to the model.

for i in range(NUM_EPOCHS):

    # training
    batch_cli.prep_for_epoch()
    mini_batch = batch_cli.get_batch()
    while mini_batch is not None:
        # model.train(mini_batch)
        mini_batch = batch_cli.get_batch()

    # eval
    batch_cli.prep_for_eval()
    mini_batch = batch_cli.get_batch(eval=True)
    while mini_batch is not None:
        # model.eval(mini_batch)
        # keep requesting eval batches; omitting eval=True here would switch
        # back to the default (training) fetch after the first eval batch
        mini_batch = batch_cli.get_batch(eval=True)


[14:01:36 INFO] Generating temp table with following query:
[14:01:36 INFO] select * from `gradient-decision.test_titanic.training_table` where date(_PARTITIONTIME) = '2020-01-20' and MOD(ABS(FARM_FINGERPRINT(cast(hash_on as string))), 1000) / 1000 < 0.8;
[14:01:36 INFO] Temp table generated. Took 0.5s.
[14:01:36 INFO] Epoch 1 contains 721 rows.
[14:01:37 INFO] Got batch 1/6 for epoch 1/2 (8%)
[14:01:37 INFO] Got batch 2/6 for epoch 1/2 (17%)
[14:01:37 INFO] Got batch 3/6 for epoch 1/2 (25%)
[14:01:37 INFO] Got batch 4/6 for epoch 1/2 (33%)
[14:01:37 INFO] Got batch 5/6 for epoch 1/2 (42%)
[14:01:37 INFO] Got batch 6/6 for epoch 1/2 (50%)
[14:01:37 INFO] Generating temp table with following query:
[14:01:37 INFO] select * from `gradient-decision.test_titanic.training_table` where date(_PARTITIONTIME) = '2020-01-20' and MOD(ABS(FARM_FINGERPRINT(cast(hash_on as string))), 1000) / 1000 between 0.8 and 1.0
[14:01:37 INFO] Temp table generated. Took 0.4s.
[14:01:37 INFO] *******************