In [None]:
# user-managed notebook = TensorFlow Enterprise 2.6 without GPU
# install TFDV by running this in a terminal: pip install --user tensorflow-data-validation
# if data lives in GCS: pip install --user 'google.cloud.storage==1.44.0'

In [None]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils import slicing_util

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from google.cloud import storage

In [None]:
#####################################################################
#
# if your data is in GCS....
#
#####################################################################

In [None]:
# set params (placeholder values -- replace with your own project / bucket)
PROJECT_ID = "my-project"
GCS_BUCKET = "my-bucket"
REGION = "us-central1"  # NOTE(review): not referenced by any cell visible here

# This cell is optional: the remainder of the notebook works on synthetic CSVs
# generated locally. Leave USE_GCS off unless the values above point at a real
# project/bucket; with the placeholders the download would presumably fail and
# interrupt a "Restart & Run All".
USE_GCS = False

# get training data
blob_name = "tabular_binary_class_even_split_slim.csv"
if USE_GCS:
    gcs_client = storage.Client(project=PROJECT_ID)
    bucket = gcs_client.get_bucket(GCS_BUCKET)
    blob = bucket.blob(blob_name)
    # The object is saved into the current working directory under its own name.
    blob.download_to_filename(blob_name)

    print("Downloaded blob {} to {}.".format(blob.name, blob_name))
else:
    print("USE_GCS is False; skipping download of {}.".format(blob_name))

In [None]:
#####################################################################
#
# for this exercise, generate synthetic data using sklearn
#
#####################################################################

In [None]:
# Build a synthetic binary-classification "training" set: 10,000 rows and
# five informative features, generated with a fixed random_state.
x, y = make_classification(
    n_samples=10000,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    random_state=34098,
)

# Append the labels as the last column of the feature matrix.
training_data = np.column_stack((x, y))
num_cols = x.shape[1]

# CSV header: x_0 ... x_{num_cols - 1}, followed by "label".
col_names = ','.join([*(f"x_{i}" for i in range(num_cols)), 'label'])
col_names

In [None]:
# Write the training matrix to a local CSV with a header row.
train_data_file = 'train.csv'
np.savetxt(
    train_data_file,
    training_data,
    fmt='%f',
    delimiter=',',
    header=col_names,
    comments="",  # empty prefix so the header is written as a plain first line
)

In [None]:
# generate synthetic eval data from a totally different distribution:
# uniform values in [100, 101), five feature columns and no label column
eval_data_file = 'eval.csv'

# Seed the generator so the eval set (and every statistic derived from it)
# is identical on each Restart & Run All.
EVAL_SEED = 34098
eval_data = np.random.default_rng(EVAL_SEED).random((10000, 5)) + 100

# Eval-specific names: the training-CSV cell reads `col_names`, so rebinding
# that variable here would give it a five-column header if it were re-run
# after this cell.
eval_num_cols = eval_data.shape[1]
eval_col_names = ','.join(f"x_{i}" for i in range(eval_num_cols))
print(eval_col_names)

np.savetxt(
    eval_data_file,
    eval_data,
    delimiter=',',
    fmt='%f',
    header=eval_col_names,
    comments="",  # empty prefix so the header is written as a plain first line
)

In [None]:
# Input locations read by the statistics cells below; point these at other
# CSV files to analyse different data.
EVAL_DATA = eval_data_file
TRAIN_DATA = train_data_file

In [None]:
# Compute statistics for the training CSV.
train_stats = tfdv.generate_statistics_from_csv(
    data_location=TRAIN_DATA,
)

In [None]:
# Render the training statistics on their own.
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [None]:
# Infer a schema from the training statistics, then render it.
schema = tfdv.infer_schema(
    statistics=train_stats,
)
tfdv.display_schema(schema=schema)

In [None]:
# Compute statistics for the evaluation CSV.
eval_stats = tfdv.generate_statistics_from_csv(
    data_location=EVAL_DATA,
)

In [None]:
# Show evaluation (left-hand side) and training (right-hand side) statistics
# together for comparison.
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    lhs_name='EVAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

In [None]:
#####################################################################
#
#  Schema validation
#
#####################################################################

In [None]:
# Validate the eval statistics against the schema inferred from the training
# data and display whatever anomalies are reported.
anomalies = tfdv.validate_statistics(
    statistics=eval_stats,
    schema=schema,
)
tfdv.display_anomalies(anomalies)

In [None]:
#####################################################################
#
#  Jensen-Shannon divergence: Drift
#
#####################################################################

In [None]:
# Baseline drift check: with no comparators set, no anomalies are detected.
# The result is named drift_anomalies because this call compares against
# previous_statistics; the skew checks later in the notebook pass
# serving_statistics instead.
drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
)
tfdv.display_anomalies(drift_anomalies)

In [None]:
# set comparators for drift (current versus previous)
# NOTE: the threshold is set on the feature object obtained from `schema`, so
# it stays in effect for every later cell that validates against that schema.
x_0 = tfdv.get_feature(schema, 'x_0')
x_0.drift_comparator.jensen_shannon_divergence.threshold = 0.001

# note: previous_statistics below relates to the drift_comparator set above
drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
)
tfdv.display_anomalies(drift_anomalies)

In [None]:
# set comparators for drift (current versus previous) on every feature;
# the Jensen-Shannon divergence thresholds range from 1.0 down to 0.00001
DRIFT_JSD_THRESHOLDS = {
    'x_0': 1.0,
    'x_1': 0.1,
    'x_2': 0.001,
    'x_3': 0.0001,
    'x_4': 0.00001,
}

# One table plus a loop replaces five copies of the same two statements.
for feature_name, threshold in DRIFT_JSD_THRESHOLDS.items():
    feature = tfdv.get_feature(schema, feature_name)
    feature.drift_comparator.jensen_shannon_divergence.threshold = threshold

# note: previous_statistics below relates to the drift_comparator set above
drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
)
tfdv.display_anomalies(drift_anomalies)

In [None]:
#####################################################################
#
#  Jensen-Shannon divergence: Skew
#
#####################################################################

In [None]:
# Skew check (training versus serving) on a single feature: attach a
# Jensen-Shannon divergence threshold to the skew comparator of x_0.
x_0 = tfdv.get_feature(schema, 'x_0')
x_0.skew_comparator.jensen_shannon_divergence.threshold = 0.001

# serving_statistics in the call below relates to the skew_comparator above.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=eval_stats,
)
tfdv.display_anomalies(skew_anomalies)

In [None]:
# set comparators for skew (training versus serving) on every feature;
# the Jensen-Shannon divergence thresholds range from 1.0 down to 0.00001
SKEW_JSD_THRESHOLDS = {
    'x_0': 1.0,
    'x_1': 0.1,
    'x_2': 0.001,
    'x_3': 0.0001,
    'x_4': 0.00001,
}

# One table plus a loop replaces five copies of the same two statements.
for feature_name, threshold in SKEW_JSD_THRESHOLDS.items():
    feature = tfdv.get_feature(schema, feature_name)
    feature.skew_comparator.jensen_shannon_divergence.threshold = threshold

# note: serving_statistics below relates to the skew_comparator set above
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=eval_stats,
)
tfdv.display_anomalies(skew_anomalies)