In [5]:
import random

import numpy as np
from PIL import Image
import pandas as pd
import torch

try:
    import google.colab  # noqa: F401

    !pip install -q daml[torch] torchmetrics torchvision
    !export LC_ALL="en_US.UTF-8"
    !export LD_LIBRARY_PATH="/usr/lib64-nvidia"
    !export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
    !ldconfig /usr/lib64-nvidia
except Exception:
    pass

!pip install -q tabulate

import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


np.random.seed(0)
np.set_printoptions(formatter={"float": lambda x: f"{x:0.4f}"})
torch.manual_seed(0)
torch.set_float32_matmul_precision("high")
device = "cuda" if torch.cuda.is_available() else "cpu"
torch._dynamo.config.suppress_errors = True

random.seed(0)
torch.use_deterministic_algorithms(True)
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


import tensorflow_datasets as tfds

from daml.models.tensorflow import AE, VAEGMM, create_model
from importlib import reload

import daml._internal.detectors.duplicates as duplicates
import daml._internal.detectors.linter as linter
import daml._internal.metrics.stats as stats

# Re-execute the daml modules imported above; presumably this is here so that
# local edits to them are picked up without restarting the kernel.
reload(stats)
reload(linter)
reload(duplicates)

# # MNIST Data
import hashlib
import os
import typing
from urllib.error import HTTPError, URLError
from urllib.request import urlretrieve


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
def download_mnist() -> str:
    """Return the local path of the MNIST ``mnist.npz`` archive.

    The actual fetching, caching and SHA-256 verification are delegated to
    ``_get_file``. The code originates from keras/datasets:

    https://github.com/keras-team/keras/blob/v2.15.0/keras/datasets/mnist.py#L25-L86
    """
    archive_name = "mnist.npz"
    base_url = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/"
    expected_sha256 = "731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1"

    return _get_file(
        archive_name,
        origin=base_url + archive_name,
        file_hash=expected_sha256,
    )


def _get_file(
    fname: str,
    origin: str,
    file_hash: typing.Optional[str] = None,
):
    cache_dir = os.path.join(os.path.expanduser("~"), ".keras")
    datadir_base = os.path.expanduser(cache_dir)
    if not os.access(datadir_base, os.W_OK):
        datadir_base = os.path.join("/tmp", ".keras")
    datadir = os.path.join(datadir_base, "datasets")
    os.makedirs(datadir, exist_ok=True)

    fname = os.fspath(fname) if isinstance(fname, os.PathLike) else fname
    fpath = os.path.join(datadir, fname)

    download = False
    if os.path.exists(fpath):
        if file_hash is not None and not _validate_file(fpath, file_hash):
            download = True
    else:
        download = True

    if download:
        try:
            error_msg = "URL fetch failure on {}: {} -- {}"
            try:
                urlretrieve(origin, fpath)
            except HTTPError as e:
                raise Exception(error_msg.format(origin, e.code, e.msg)) from e
            except URLError as e:
                raise Exception(error_msg.format(origin, e.errno, e.reason)) from e
        except (Exception, KeyboardInterrupt):
            if os.path.exists(fpath):
                os.remove(fpath)
            raise

        if os.path.exists(fpath) and file_hash is not None and not _validate_file(fpath, file_hash):
            raise ValueError(
                "Incomplete or corrupted file detected. "
                f"The sha256 file hash does not match the provided value "
                f"of {file_hash}.",
            )
    return fpath


def _validate_file(fpath, file_hash, chunk_size=65535):
    hasher = hashlib.sha256()
    with open(fpath, "rb") as fpath_file:
        for chunk in iter(lambda: fpath_file.read(chunk_size), b""):
            hasher.update(chunk)

    return str(hasher.hexdigest()) == str(file_hash)


# Local path of the MNIST archive; download_mnist() fetches it when it is not cached.
mnist_path = download_mnist()

In [7]:
# Build the image set used below: the first `size` MNIST training images,
# replicated to 3 channels, scaled by 1/255, with integer noise added.
rng = np.random.default_rng(33)
size = 10000

# allow_pickle=False: the x_train / y_train entries are expected to be plain
# numeric arrays, so unpickling (which can execute arbitrary code) is not needed.
with np.load(mnist_path, allow_pickle=False) as fp:
    test_images, labels = fp["x_train"][:size], fp["y_train"][:size]

# (N, H, W) -> (N, 1, H, W) -> (N, 3, H, W), then divide by 255.
norm_test_imgs = np.repeat(test_images[:, np.newaxis, :, :], 3, axis=1) / 255
# Integer noise drawn from [0, 10) for every element.
jitter = rng.integers(10, size=norm_test_imgs.shape)
norm_test_imgs += jitter

In [8]:
count = 5000

# The linter looks at the first `count` images on their own.
lint = linter.Linter(norm_test_imgs[:count])

# Duplicate detection gets the same images plus exact copies of the first 10
# and slightly rescaled (x1.001, clipped to [0, 10]) copies of the first 5.
base_imgs = norm_test_imgs[:count]
exact_copies = norm_test_imgs[:10]
near_copies = np.clip(norm_test_imgs[:5] * 1.001, 0.0, 10.0)
dupe = duplicates.Duplicates(np.concatenate((base_imgs, exact_copies, near_copies)))

results = lint.evaluate()
dupes = dupe.evaluate()

In [None]:
# User-specified inputs



In [43]:
# Spot-check: show what the linter reported for a single image (index 464).
# `results` is indexed by image index; each entry maps a metric name to a value.
results[464]

{'brightness': 0.02}

In [73]:
# Tally, per metric, how many images were flagged, and count how many images
# were flagged by at least one metric.
outliers = {}
num_outliers = 0
for idx in results:
    flagged_metrics = list(results[idx])
    if flagged_metrics:
        num_outliers += 1

    for metric_name in flagged_metrics:
        # Counts are kept in one-element lists so that the dict converts
        # directly into a single-row DataFrame later on.
        outliers.setdefault(metric_name, [0])[0] += 1

outlier_stats = {
    "total": num_outliers,
    "percent": round(num_outliers / count, 4),
}

In [74]:
# Exact duplicates reported by the detector.
num_dupes = len(dupes["exact"])
percent_dupes = round(num_dupes / count, 4)

# Near duplicates: read the "near" entry (the original re-used "exact" here,
# so the near-duplicate figures merely repeated the exact-duplicate ones).
num_near = len(dupes["near"])
percent_near = round(num_near / count, 4)

# NOTE(review): both fractions are relative to `count`, although the detector
# was given `count` images plus the appended copies -- confirm the intended base.

outlier_dict = outliers

In [75]:
# Bundle the duplicate figures into the dictionary that the report helpers
# turn into a table.
dupe_stats = dict(
    num_dupes=num_dupes,
    percent_dupes=percent_dupes,
    num_near=num_near,
    percent_near=percent_near,
)

In [80]:
from gradient.slide_deck.shapes import SubText, Table, Text, TextContent
from gradient.slide_deck.slidedeck import (
    DEFAULT_GRADIENT_PRESENTATION_TEMPLATE_PATH,
    DefaultGradientSlideLayouts,
    SlideDeck,
)


def generate_dupe_report_table(dupe_stats: dict) -> pd.DataFrame:
    """Build a one-row table summarising the duplicate statistics.

    Args:
        dupe_stats: Mapping with the keys ``num_dupes``, ``percent_dupes``,
            ``num_near`` and ``percent_near``.

    Returns:
        A single-row DataFrame with one human-readable column per statistic;
        every value is converted to ``str``.
    """
    column_sources = (
        ("Number of Exact Duplicates", "num_dupes"),
        ("Percent Exact Duplicates in Dataset", "percent_dupes"),
        ("Number of Near Duplicates", "num_near"),
        ("Percent Near Duplicates in Dataset", "percent_near"),
    )
    return pd.DataFrame({header: [str(dupe_stats[key])] for header, key in column_sources})


def generate_outlier_report_table(outlier_dict: dict) -> pd.DataFrame:
    """Turn the per-metric outlier tallies into a DataFrame.

    Args:
        outlier_dict: Mapping of metric name to a list of values; each key
            becomes a column.

    Returns:
        A DataFrame with one column per key of ``outlier_dict``.
    """
    outlier_table = pd.DataFrame(outlier_dict)
    return outlier_table


def generate_dupe_report_slide_kwargs(dupe_stats: dict) -> dict:
    """Assemble the keyword arguments for the duplicate-detection summary slide.

    Args:
        dupe_stats: Mapping with at least ``percent_dupes`` (a fraction) plus
            the keys required by ``generate_dupe_report_table``.

    Returns:
        A dict with the keys ``title``, ``layout``, ``placeholder_fillings``
        and ``additional_shapes``, meant to be unpacked into
        ``SlideDeck.add_slide``.
    """
    # Multiplying a rounded fraction by 100 can expose binary float noise
    # (0.0014 * 100 -> 0.13999999999999999), so round the percentage again and
    # let the "g" format drop trailing zeros.
    percent_text = f"{round(dupe_stats['percent_dupes'] * 100, 2):g}"
    content = [
        f"{percent_text}% of the dataset is a duplicate entry.",
    ]

    kwargs = {
        "title": "Duplicate Detection: Summary",
        "layout": DefaultGradientSlideLayouts.CONTENT_DEFAULT,
        "placeholder_fillings": [TextContent(lines=[Text(content=content)])],
        "additional_shapes": [
            Table(
                dataframe=generate_dupe_report_table(dupe_stats),
                fontsize=16,
                left=2.0,
                top=2.0,
                width=9.0,
                height=4.0,
            ),
        ],
    }
    return kwargs


def generate_outlier_report_slide_kwargs(outlier_dict: dict, outlier_stats: dict) -> dict:
    """Assemble the keyword arguments for the outlier-detection summary slide.

    Args:
        outlier_dict: Per-metric tallies, rendered as the slide's table via
            ``generate_outlier_report_table``.
        outlier_stats: Mapping with ``total`` (number of flagged images) and
            ``percent`` (the same figure as a fraction of the dataset).

    Returns:
        A dict with the keys ``title``, ``layout``, ``placeholder_fillings``
        and ``additional_shapes``, meant to be unpacked into
        ``SlideDeck.add_slide``.
    """
    # Multiplying a rounded fraction by 100 can expose binary float noise
    # (0.07 * 100 -> 7.000000000000001), so round the percentage again and let
    # the "g" format drop trailing zeros.
    percent_text = f"{round(100 * outlier_stats['percent'], 2):g}"
    content = [
        f"{outlier_stats['total']} images in the dataset are outliers ({percent_text}% of the dataset).",
    ]

    kwargs = {
        "title": "Outlier Detection: Summary",
        "layout": DefaultGradientSlideLayouts.CONTENT_DEFAULT,
        "placeholder_fillings": [TextContent(lines=[Text(content=content)])],
        "additional_shapes": [
            Table(
                dataframe=generate_outlier_report_table(outlier_dict),
                fontsize=16,
                left=2.0,
                top=2.0,
                width=9.0,
                height=4.0,
            ),
        ],
    }
    return kwargs

In [81]:
from pathlib import Path

# Folder, relative to the current working directory, used for the report output.
example_directory = Path.cwd().joinpath("report_linting_example")
example_directory.mkdir(exist_ok=True, parents=True)

In [82]:
# Start a deck from the default Gradient template, then append one summary
# slide per report: outliers first, duplicates second.
deck = SlideDeck(presentation_template_path=DEFAULT_GRADIENT_PRESENTATION_TEMPLATE_PATH)

outlier_slide_kwargs = generate_outlier_report_slide_kwargs(outlier_dict, outlier_stats)
deck.add_slide(**outlier_slide_kwargs)

dupe_slide_kwargs = generate_dupe_report_slide_kwargs(dupe_stats)
deck.add_slide(**dupe_slide_kwargs)

In [83]:
# Write the deck into the example directory; the value returned by save() is
# left as the cell's last expression so the notebook displays it.
deck.save(output_directory=example_directory, name="report_linting_example")

PosixPath('/workspaces/daml/prototype/report_linting_example/report_linting_example_4.pptx')