# Goal

In this tutorial we'll generate a dataset and focus on reproducibility. These are the steps:

1. Generate dataset of tfrecords
2. Store information on HOW they were generated
3. Track dataset using DVC
4. Push dataset to an S3 remote (definiens network share would also work)
5. Access immutable dataset on S3 via repo_url, commit_hash and rel_path

## Generate dataset of tfrecords

In [5]:
from pathlib import Path

# Absolute path of the dataset directory, located inside the current working directory.
DATA_DIRPATH = Path(".").resolve().joinpath("data")
# Create the directory on first run; do nothing if it already exists.
DATA_DIRPATH.mkdir(exist_ok=True)
DATA_DIRPATH

PosixPath('/home/ubuntu/git/litte_wandb/data')

In [8]:
# see https://gitlab.img.astrazeneca.net/RA/digipath-sdk/-/tree/master/src/simple-tfrecords
import numpy as np
from az_cp_simple_tfrecords import features, encode
import tensorflow as tf

def _create_dummy_record():
    """Build one dummy record and return it in serialized form.

    Two random integer arrays are drawn (a uint8 array of shape (1, 2, 3) and
    a uint32 array of shape (1, 2, 2, 3)); all other features are fixed
    constants. The feature dict is passed to ``encode`` and the result of
    calling ``SerializeToString()`` on the encoded example is returned.
    """
    # Draw order matters for the global NumPy random state: image first, then the multichannel array.
    png_pixels = np.random.randint(0, 256, size=(1, 2, 3), dtype=np.uint8)
    channel_values = np.random.randint(0, 10_000, size=(1, 2, 2, 3), dtype=np.uint32)

    feature_by_name = dict(
        od_value=features.Float32Feature(1.23),
        bounding_boxes=features.Float32ListFeature([0., 0., 100., 200.]),
        cell_count=features.Int64Feature(2**60),
        slide_name=features.StringFeature("critical slide"),
        low_resolution_image=features.TFPNGFeature(png_pixels),
        multichannel_image=features.TFArrayFeature(tf.convert_to_tensor(channel_values)),
    )
    return encode(feature_by_name).SerializeToString()


# Write 3 tfrecord files, each holding 5 dummy records.
for file_index in range(3):
    tfrecord_path = DATA_DIRPATH.joinpath(f"train_{file_index}.tfrecord")
    with tf.io.TFRecordWriter(str(tfrecord_path)) as writer:
        for _ in range(5):
            writer.write(_create_dummy_record())


## Store information on HOW data was generated

In [11]:
from little_wandb import dump_infos

# Store the information on how the data was generated in the `repro` subfolder of the dataset.
dump_infos(DATA_DIRPATH.joinpath("repro"))

unable to save code -- program entry not found


In [18]:
# Print each file found in data/repro, in reverse sorted order.
for repro_file in sorted(DATA_DIRPATH.glob("repro/*"), reverse=True):
    print(f"\n\n### Content of {repro_file}")
    print(repro_file.read_text())




### Content of /home/ubuntu/git/litte_wandb/data/repro/wandb-metadata.json
{
    "os": "Linux-5.19.0-1024-aws-x86_64-with-glibc2.35",
    "python": "3.10.6",
    "heartbeatAt": "2023-05-11T14:34:37.579951",
    "startedAt": "2023-05-11T14:34:37.577779",
    "docker": null,
    "cuda": null,
    "args": null,
    "state": "running",
    "program": "<python with no main file>",
    "git": {
        "remote": "git@gitlab.img.astrazeneca.net:RA/experimental/litte_wandb.git",
        "commit": "bb9d942d7480a21380594da0be0a9bfff624f4b1"
    },
    "email": "",
    "root": "/home/ubuntu/git/litte_wandb",
    "host": "cgebbe-r6id.img.astrazeneca.net",
    "username": "ubuntu",
    "executable": "/home/ubuntu/git/litte_wandb/venv/bin/python"
}



### Content of /home/ubuntu/git/litte_wandb/data/repro/requirements.txt
ansiwrap==0.8.4
appdirs==1.4.4
asttokens==2.2.1
attrs==23.1.0
backcall==0.2.0
build==0.10.0
certifi==2023.5.7
charset-normalizer==3.1.0
click==8.1.3
comm==0.1.3
debugpy==1.6.7
de

## Track files using DVC

In [20]:
# Initialise DVC in the current git repository.
!dvc init

Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mhttps://dvc.org/doc/user-guide/analytics[39m>              [31m|[0m
[31m|[0m                                                                     [31m|[0m
[31m+---------------------------------------------------------------------+
[0m
[33mWhat's next?[39m
[33m------------[39m
- Check out the documentation: <[36mhttps://dvc.org/doc[39m>
- Get help and share ideas: <[36mhttps://dvc.org/chat[39m>
- Star us on GitHub: <[36mhttps://github.com/iterative/dvc[39m>
[0m

In [22]:
# Link workspace files to the DVC cache via symlinks instead of copying them.
!dvc config cache.type symlink

# Run `git add` automatically after each `dvc add`.
!dvc config core.autostage true

# List the current config together with the file each setting comes from.
!dvc config --list --show-origin

[0m[0m.dvc/config	cache.type=symlink
.dvc/config	core.autostage=true
[0m

In [23]:
# Track the `data` directory using DVC. This does three things:
# 1. It moves the original files to .dvc/cache and replaces them with symlinks to .dvc/cache
# 2. It adds all original files to .gitignore
# 3. It creates a new file `data.dvc`, containing pointers to the cache. This file should be tracked by git!!!
!dvc add data

[?25l                                                                          [32m⠋[0m Checking graph
Adding...                                                                       
![A
Building data objects from data                       |0.00 [00:00,      ?obj/s][A
                                                                                [A
![A
Building data objects from data                       |0.00 [00:00,      ?obj/s][A
                                                                                [A
![A
  0% Checking cache in '/home/ubuntu/git/litte_wandb/.dvc/cache'| |0/? [00:00<?,[A
                                                                                [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
  0%|          |Transferring                          0/6 [00:00<?,     ?file/s][A
  0%|          |Transferring                    6/? [00:00<00:00, 4034.92file/s][A

![A[A

  0%|          |memory://.9

In [26]:
# List the files under data/ that are tracked by DVC (`--dvc-only`), descending into subdirectories (`--recursive`).
!dvc list data --dvc-only --recursive

data/repro/diff.patch
data/repro/requirements.txt
data/repro/wandb-metadata.json
data/train_0.tfrecord
data/train_1.tfrecord
data/train_2.tfrecord
[0m

## Push data to an S3 bucket

In [24]:
# Register the S3 location as a DVC remote named `mys3` and make it the default remote.
!dvc remote add --default mys3 s3://az-data-xfer/cgebbe/dvc_tutorial

Setting 'mys3' as a default remote.
[0m

In [18]:
# Load the AWS access keys from an untracked .env file into the process environment.
import dotenv
import os

dotenv.load_dotenv()

# Fail early, and say which variable is missing instead of raising a bare AssertionError.
for key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"]:
    assert key in os.environ, f"{key} is not set - add it to the .env file or export it"

In [30]:
# Upload the DVC-tracked files to the default remote (`mys3`).
!dvc push

  0% Transferring|                                   |0/6 [00:00<?,     ?file/s]
![A
  0%|          |/home/ubuntu/git/litte_wandb/.dvc/c0.00/? [00:00<?,        ?B/s][A
  0%|          |/home/ubuntu/git/litte_wandb/.d0.00/1.94k [00:00<?,        ?B/s][A
                                                                                [A
![A
  0%|          |/home/ubuntu/git/litte_wandb/.dvc/c0.00/? [00:00<?,        ?B/s][A
  0%|          |/home/ubuntu/git/litte_wandb/.d0.00/1.94k [00:00<?,        ?B/s][A

![A[A

  0%|          |/home/ubuntu/git/litte_wandb/.dvc/c0.00/? [00:00<?,        ?B/s][A[A

  0%|          |/home/ubuntu/git/litte_wandb/.d0.00/1.15k [00:00<?,        ?B/s][A[A


![A[A[A


  0%|          |/home/ubuntu/git/litte_wandb/.dvc/c0.00/? [00:00<?,        ?B/s][A[A[A


  0%|          |/home/ubuntu/git/litte_wandb/.d0.00/1.94k [00:00<?,        ?B/s][A[A[A



![A[A[A[A



  0%|          |/home/ubuntu/git/litte_wandb/.dvc/c0.00/? [00:00<?,        ?B/s][A[A

## Access immutable dataset via repo_url, commit_hash and rel_path

In [34]:
# As mentioned above, DVC creates a `data.dvc` file, which should be added to git.
# NOTE(review): `git add .` stages everything in the working tree - confirm that the .env file holding the AWS keys is git-ignored.
!git add .
!git commit -m "feat: Added tfrecord dataset tracked by DVC"
# Publish the branch so that the commit can be fetched from other machines.
!git push --set-upstream origin geb-dvc-tutorial

Enumerating objects: 12, done.
Counting objects: 100% (12/12), done.
Delta compression using up to 8 threads
Compressing objects: 100% (8/8), done.
Writing objects: 100% (10/10), 4.24 KiB | 4.24 MiB/s, done.
Total 10 (delta 1), reused 0 (delta 0), pack-reused 0
remote: 
remote: To create a merge request for geb-dvc-tutorial, visit:[K
remote:   https://gitlab.img.astrazeneca.net/RA/experimental/litte_wandb/-/merge_requests/new?merge_request%5Bsource_branch%5D=geb-dvc-tutorial[K
remote: 
To gitlab.img.astrazeneca.net:RA/experimental/litte_wandb.git
 * [new branch]      geb-dvc-tutorial -> geb-dvc-tutorial
Branch 'geb-dvc-tutorial' set up to track remote branch 'geb-dvc-tutorial' from 'origin'.


In [8]:
# Preparation: get current commit-hash and remote URL
def _parse(s):
    return s[0]


# `x = !cmd` captures the command's output as a list of lines; `_parse` keeps the first one.
x = !git config --get remote.origin.url
REPO_URL = _parse(x)

x = !git rev-parse HEAD
COMMIT = _parse(x)

# Location of the dataset inside the repository.
RELPATH = "/data"

print(f"{REPO_URL=}, {COMMIT=}")

REPO_URL='git@gitlab.img.astrazeneca.net:RA/experimental/litte_wandb.git', COMMIT='0da1d6eaa58b08deb880dde85c6d3b7226b0092c'


In [21]:
# Assume that the following happens on a completely different machine!
# FIXME: DVC fails silently if AWS_ACCESS_KEY_ID is not defined
import dvc.api

def _get_remote_paths(repo_url, commit, relpath):
    """Map DVC-tracked paths under ``relpath`` to their URLs on the DVC remote.

    Args:
        repo_url: URL of the git repository that holds the ``.dvc`` files.
        commit: Git revision at which the dataset is looked up.
        relpath: Path inside the repository to search.

    Returns:
        Dict from each path found (DVC-tracked only, search depth limited
        to 1) to the URL returned by ``dvc.api.get_url`` for it.
    """
    fs = dvc.api.DVCFileSystem(url=repo_url, rev=commit)
    tracked_paths = fs.find(relpath, detail=False, dvc_only=True, maxdepth=1)

    remote_paths = {}
    for path in tracked_paths:
        remote_paths[path] = dvc.api.get_url(path=path, repo=fs.repo_url, rev=commit)
    return remote_paths

dct = _get_remote_paths(REPO_URL, COMMIT, RELPATH)
dct

{'/data/train_0.tfrecord': 's3://az-data-xfer/cgebbe/dvc_tutorial/1c/e7928329620af70dbebb540e45e627',
 '/data/train_1.tfrecord': 's3://az-data-xfer/cgebbe/dvc_tutorial/07/a99ec680150604a7f8e6e5664a15d8',
 '/data/train_2.tfrecord': 's3://az-data-xfer/cgebbe/dvc_tutorial/e7/20424344404b298f412655da3ec57d'}

In [32]:
# In that remote machine, let's directly use these S3 filepaths in tensorflow
from pprint import pprint  # was missing: `pprint` is called below but never imported in this notebook

import tensorflow as tf
import tensorflow_io as tfio  # import necessary to support S3
import az_cp_simple_tfrecords


# Read the records straight from the S3 URLs collected in `dct`.
ds = tf.data.TFRecordDataset(list(dct.values()))
# Derive the feature description from the dataset itself.
schema = az_cp_simple_tfrecords.infer_description_from_dataset(ds)

# Decode every serialized record with the inferred schema and show the first one.
ds = ds.map(lambda r: az_cp_simple_tfrecords.decode(r, schema))
for tensor_dict in ds.take(1):
    pprint(tensor_dict)

{'bounding_boxes': SparseTensor(indices=tf.Tensor(
[[0]
 [1]
 [2]
 [3]], shape=(4, 1), dtype=int64), values=tf.Tensor([  0.   0. 100. 200.], shape=(4,), dtype=float32), dense_shape=tf.Tensor([4], shape=(1,), dtype=int64)),
 'cell_count': <tf.Tensor: shape=(), dtype=int64, numpy=1152921504606846976>,
 'low_resolution_image': <tf.Tensor: shape=(1, 2, 3), dtype=uint8, numpy=
array([[[234, 237, 225],
        [ 30, 233,  67]]], dtype=uint8)>,
 'multichannel_image': <tf.Tensor: shape=(1, 2, 2, 3), dtype=uint32, numpy=
array([[[[3716, 4290, 6018],
         [8552, 1570, 3700]],

        [[1555, 9442, 2024],
         [2135, 9196, 9416]]]], dtype=uint32)>,
 'od_value': <tf.Tensor: shape=(), dtype=float32, numpy=1.23>,
 'slide_name': <tf.Tensor: shape=(), dtype=string, numpy=b'critical slide'>}


In [33]:
# only for future references: versions of tensorflow and tensorflow-io used...
!pip freeze

absl-py==1.4.0
aiobotocore==2.5.0
aiohttp==3.8.4
aiohttp-retry==2.8.3
aioitertools==0.11.0
aiosignal==1.3.1
amqp==5.1.1
ansiwrap==0.8.4
antlr4-python3-runtime==4.9.3
appdirs==1.4.4
assertpy==1.1
asttokens==2.2.1
astunparse==1.6.3
async-timeout==4.0.2
asyncssh==2.13.1
atpublic==3.1.1
attrs==23.1.0
az-cp-simple-tfrecords==0.1.1
backcall==0.2.0
billiard==3.6.4.0
boto3==1.26.76
botocore==1.29.76
build==0.10.0
cachetools==5.3.0
celery==5.2.7
certifi==2023.5.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
click-didyoumean==0.3.0
click-plugins==1.1.1
click-repl==0.2.0
colorama==0.4.6
comm==0.1.3
configobj==5.0.8
cryptography==40.0.2
debugpy==1.6.7
decorator==5.1.1
dictdiffer==0.9.0
diskcache==5.6.1
distro==1.8.0
docker-pycreds==0.4.0
dpath==2.1.5
dulwich==0.21.5
dvc==2.56.0
dvc-data==0.47.5
dvc-http==2.30.2
dvc-objects==0.21.2
dvc-render==0.4.0
dvc-s3==2.22.0
dvc-studio-client==0.9.0
dvc-task==0.2.1
entrypoints==0.4
executing==1.2.0
fastjsonschema==2.16.3
filelock==3.12.0
flatbuffers==2